diff --git a/.github/workflows/testsuite_allprocesses.yml b/.github/workflows/testsuite_allprocesses.yml
index 7eaad09c9f..1d67a9efaf 100644
--- a/.github/workflows/testsuite_allprocesses.yml
+++ b/.github/workflows/testsuite_allprocesses.yml
@@ -19,8 +19,9 @@ on:
     branches: [ master ]
 
   # Trigger the all-processes workflow when new changes to the workflow are pushed
-  push:
-    paths: [ .github/workflows/testsuite* ]
+  # (NB: this is now disabled to avoid triggering two jobs when pushing to a branch for which a PR is opened)
+  ###push:
+  ###  paths: [ .github/workflows/testsuite* ]
 
 #----------------------------------------------------------------------------------------------------------------------------------
 
diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh
index 639991f1cb..d49d70c200 100755
--- a/.github/workflows/testsuite_oneprocess.sh
+++ b/.github/workflows/testsuite_oneprocess.sh
@@ -35,19 +35,23 @@ function codegen() {
     ./CODEGEN/generateAndCompare.sh -q ${proc%.sa}
   fi
   # Check if there are any differences to the current repo
-  ###compare=true # enable comparison to current git repo
-  compare=false # disable comparison to current git repo
-  if [ ${compare} ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then
+  ###compare=1 # enable comparison to current git repo
+  compare=0 # disable comparison to current git repo
+  if [ "${compare}" != "0" ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then
+    echo
+    echo "Compare newly generated code for ${proc} to that in the madgraph4gpu github repository"
     git checkout HEAD ${proc}/CODEGEN*.txt
     if [ "${proc%.mad}" != "${proc}" ]; then
       git checkout HEAD ${proc}/Cards/me5_configuration.txt
       ###sed -i 's/DEFAULT_F2PY_COMPILER=f2py.*/DEFAULT_F2PY_COMPILER=f2py3/' ${proc}/Source/make_opts
       git checkout HEAD ${proc}/Source/make_opts
     fi
-    echo
     echo "git diff (start)"
     git diff --exit-code
     echo "git diff (end)"
+  else
+    echo
+    echo "(SKIP comparison of newly generated code for ${proc} to that in the madgraph4gpu github repository)"
   fi
 }
 
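Side note on the `compare` guard above: the old test `[ ${compare} ]` was always true, because `[ string ]` only checks that the string is non-empty, so `compare=false` still enabled the comparison; the patched script therefore switches to a 0/1 sentinel with an explicit `!= "0"` test. A minimal Python sketch of the same truthiness pitfall (illustration only, not part of the patch):

```python
# A non-empty string is truthy regardless of its content, in Python just as
# in a shell `[ ... ]` test: this is why compare=false did not disable it.
compare = "false"
if compare:                  # truthy: only emptiness matters, not the content
    print("comparison would still run even though compare='false'")

compare = "0"                # 0/1 sentinel used by the patched script
if compare != "0":           # explicit test, as in: [ "${compare}" != "0" ]
    print("comparison enabled")
else:
    print("comparison skipped")
```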
diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo
index d8c1613ccf..23f61b93fd 160000
--- a/MG5aMC/mg5amcnlo
+++ b/MG5aMC/mg5amcnlo
@@ -1 +1 @@
-Subproject commit d8c1613ccf638b5b078a64379e385def5649622c
+Subproject commit 23f61b93fdf268a1cdcbd363cd449c88b3511d7a
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# SIMD/GPU configuration for the run_card -----------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
+        if not hasattr(self, 'path'):
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type':
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception('INTERNAL ERROR! reset_makeopts called for unexpected parameter %s' % name)
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already set up in default class (just change value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            super().write_one_include_file(output_dir, incname, output_file)
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already set up in default class (just change value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
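For readers unfamiliar with the `fct_mod` hooks registered above for `floating_type` and `avx_level`: `add_param` stores a callback that fires whenever the user changes the parameter, which is how the plugin refreshes make_opts (FPTYPE/AVX) and triggers `make cleanavx` before the next build. A minimal self-contained sketch of that mechanism (mock class and names, not the MG5aMC API):

```python
# Mock of the fct_mod hook pattern used by CPPRunCard above; MockRunCard and
# set_user_value are illustrative stand-ins, not MG5aMC classes or methods.
class MockRunCard(dict):
    def __init__(self):
        super().__init__()
        self.fct_mod = {}   # name -> (callback, extra_args, extra_kwargs)
        self.allowed = {}   # name -> list of allowed values

    def add_param(self, name, value, allowed=None, fct_mod=None):
        self[name] = value
        if allowed:
            self.allowed[name] = allowed
        if fct_mod:
            self.fct_mod[name] = fct_mod

    def set_user_value(self, name, new_value):
        if name in self.allowed and new_value not in self.allowed[name]:
            raise ValueError('%s=%s not in %s' % (name, new_value, self.allowed[name]))
        old_value = self[name]
        self[name] = new_value
        if name in self.fct_mod:   # fire the hook on a change, as banner.py does
            fct, args, kwargs = self.fct_mod[name]
            fct(old_value, new_value, name, *args, **kwargs)

def rebuild_hook(old_value, new_value, name):
    # in the real plugin this updates make_opts and runs 'make cleanavx'
    print('%s changed %s -> %s: refresh make_opts, clean stale objects' % (name, old_value, new_value))

card = MockRunCard()
card.add_param('floating_type', 'd', allowed=['m', 'd', 'f'],
               fct_mod=(rebuild_hook, (), {}))
card.set_user_value('floating_type', 'f')
```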
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
index 5b557e832a..5267141530 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
@@ -149,6 +149,11 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU):
     ###helas_exporter = None
     helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341!
 
+    # Default class for the run_card to use
+    from . import launch_plugin
+    run_card_class = launch_plugin.CPPRunCard
+
     # AV (default from OM's tutorial) - add a debug printout
     def __init__(self, *args, **kwargs):
         self.in_madevent_mode = False # see MR #747
@@ -203,7 +208,7 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag):
            cmdhistory is the list of command used so far.
            MG5options are all the options of the main interface
            outputflags is a list of options provided when doing the output command"""
-        misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self))
+        ###misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self))
         if self.in_madevent_mode:
             self.add_input_for_banner()
             if 'CUDACPP_CODEGEN_PATCHLEVEL' in os.environ: patchlevel = os.environ['CUDACPP_CODEGEN_PATCHLEVEL']
@@ -237,7 +242,8 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag):
                 raise Exception('ERROR! the O/S call to patchMad.sh failed')
             # Additional patching (OM)
             self.add_madevent_plugin_fct() # Added by OM
-        return super().finalize(matrix_element, cmdhistory, MG5options, outputflag)
+        # do not call the standard finalize since this is already done above...
+        #return super().finalize(matrix_element, cmdhistory, MG5options, outputflag)
 
     # AV (default from OM's tutorial) - overload settings and add a debug printout
     def modify_grouping(self, matrix_element):
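The new `run_card_class` attribute advertises which RunCard subclass MadEvent should instantiate for this exporter. A hedged sketch of that class-attribute dispatch pattern (all names below are illustrative, not the MG5aMC API):

```python
# Illustrative dispatch on an exporter-level class attribute: consumers pick
# the advertised run-card class and fall back to a default when absent.
class DefaultRunCard:
    pass

class PluginRunCard(DefaultRunCard):
    pass

class Exporter:
    run_card_class = PluginRunCard   # mirrors run_card_class set above

def make_run_card(exporter):
    cls = getattr(exporter, 'run_card_class', DefaultRunCard)
    return cls()

assert isinstance(make_run_card(Exporter()), PluginRunCard)
```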
diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh
index 1f416ac5b9..f3f830205c 100755
--- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh
+++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh
@@ -152,7 +152,7 @@ function codeGenAndDiff()
   echo -e "\n+++ Generate code for '$proc'\n"
   ###exit 0 # FOR DEBUGGING
   # Vector size for mad/madonly meexporter (VECSIZE_MEMMAX)
-  vecsize=16384 # NB THIS IS NO LONGER IGNORED (but will eventually be tunable via runcards)
+  vecsize=32 # NB THIS IS NO LONGER IGNORED (but will eventually be tunable via runcards)
   # Generate code for the specific process
   pushd $MG5AMC_HOME >& /dev/null
   mkdir -p ../TMPOUT
@@ -201,13 +201,28 @@ function codeGenAndDiff()
     cat ${outproc}_log.txt | egrep -v '(Crash Annotation)' > ${outproc}_log.txt.new # remove firefox 'glxtest: libEGL initialize failed' errors
     \mv ${outproc}_log.txt.new ${outproc}_log.txt
   fi
+  # Check the code generation log for errors
+  if [ -d ${outproc} ] && ! grep -q "Please report this bug" ${outproc}_log.txt; then
+    ###cat ${outproc}_log.txt; exit 0 # FOR DEBUGGING
+    cat ${MG5AMC_HOME}/${outproc}_log.txt | { egrep 'INFO: (Try|Creat|Organiz|Process)' || true; }
+  else
+    echo "*** ERROR! Code generation failed"
+    cat ${MG5AMC_HOME}/${outproc}_log.txt
+    echo "*** ERROR! Code generation failed"
+    exit 1
+  fi
   # Patches moved here from patchMad.sh after Olivier's PR #764 (THIS IS ONLY NEEDED IN THE MADGRAPH4GPU GIT REPO)  
   if [ "${OUTBCK}" == "mad" ]; then
     # Force the use of strategy SDE=1 in multichannel mode (see #419)
     sed -i 's/2  = sde_strategy/1  = sde_strategy/' ${outproc}/Cards/run_card.dat
+    # Force the use of VECSIZE_MEMMAX=16384
+    sed -i 's/16 = vector_size/16384 = vector_size/' ${outproc}/Cards/run_card.dat
+    # Force the use of fast-math in Fortran builds
+    sed -i 's/-O = global_flag.*/-O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)/' ${outproc}/Cards/run_card.dat
     # Generate run_card.inc and param_card.inc (include stdout and stderr in the code generation log which is later checked for errors)
     # These two steps are part of "cd Source; make" but they actually are code-generating steps
-    ${outproc}/bin/madevent treatcards run  >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug")
+    # Note: treatcards run also regenerates vector.inc if vector_size has changed in the runcard
+    ${outproc}/bin/madevent treatcards run >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug")
     ${outproc}/bin/madevent treatcards param >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug")
     # Cleanup
     \rm -f ${outproc}/crossx.html
@@ -223,37 +238,18 @@ function codeGenAndDiff()
     touch ${outproc}/HTML/.keep # new file
     if [ "${patchlevel}" != "0" ]; then
       # Add global flag '-O3 -ffast-math -fbounds-check' as in previous gridpacks
+      # (FIXME? these flags are already set in the runcards, why are they not propagated to make_opts?)
       echo "GLOBAL_FLAG=-O3 -ffast-math -fbounds-check" > ${outproc}/Source/make_opts.new
       cat ${outproc}/Source/make_opts >> ${outproc}/Source/make_opts.new
       \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts
     fi
     if [ "${patchlevel}" == "2" ]; then
       sed -i 's/DEFAULT_F2PY_COMPILER=f2py.*/DEFAULT_F2PY_COMPILER=f2py3/' ${outproc}/Source/make_opts
-      cat ${outproc}/Source/make_opts | sed '/#end/q' | sort > ${outproc}/Source/make_opts.new
+      cat ${outproc}/Source/make_opts | sed '/#end/q' | head --lines=-1 | sort > ${outproc}/Source/make_opts.new
       cat ${outproc}/Source/make_opts | sed -n -e '/#end/,$p' >> ${outproc}/Source/make_opts.new
       \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts
-      echo "
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
-
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA" >> ${outproc}/Cards/run_card.dat
     fi
   fi
-  # Check the code generation log for errors 
-  if [ -d ${outproc} ] && ! grep -q "Please report this bug" ${outproc}_log.txt; then
-    ###cat ${outproc}_log.txt; exit 0 # FOR DEBUGGING
-    cat ${MG5AMC_HOME}/${outproc}_log.txt | { egrep 'INFO: (Try|Creat|Organiz|Process)' || true; }
-  else
-    echo "*** ERROR! Code generation failed"
-    cat ${MG5AMC_HOME}/${outproc}_log.txt
-    echo "*** ERROR! Code generation failed"
-    exit 1
-  fi
   popd >& /dev/null
   # Choose which directory must be copied (for gridpack generation: untar and modify the gridpack)
   if [ "${SCRBCK}" == "gridpack" ]; then
diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index e6546f684c..e090137829 100644
--- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005372047424316406 
+DEBUG: model prefixing  takes 0.005680561065673828 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -156,15 +156,15 @@ INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Process has 2 diagrams 
 1 processes with 2 diagrams generated in 0.004 s
 Total: 1 processes with 2 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
 INFO: initialize a new directory: CODEGEN_mad_ee_mumu 
 INFO: remove old information in CODEGEN_mad_ee_mumu 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards 
@@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Processing color information for process: e+ e- > mu+ mu- @1 
 INFO: Creating files in directory P1_epem_mupmum 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f64657c44f0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f7ba46f2b80> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -183,27 +183,27 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  0 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 WARNING: vector code for lepton pdf not implemented. We removed the option to run dressed lepton 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 
 INFO: Finding symmetric diagrams for subprocess group epem_mupmum 
 Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.098 s
+Wrote files for 8 helas calls in 0.097 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
-ALOHA: aloha creates 3 routines in  0.200 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+ALOHA: aloha creates 3 routines in  0.199 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
 ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 7 routines in  0.255 s
+ALOHA: aloha creates 7 routines in  0.254 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -226,7 +226,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
@@ -242,16 +241,16 @@ patching file matrix1.f
 Hunk #3 succeeded at 230 (offset 9 lines).
 Hunk #4 succeeded at 267 (offset 18 lines).
 Hunk #5 succeeded at 312 (offset 18 lines).
-DEBUG:  p.returncode =  0 [output.py at line 232] 
+DEBUG:  p.returncode =  0 [output.py at line 237] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m4.853s
-user	0m1.653s
-sys	0m0.201s
+real	0m1.848s
+user	0m1.621s
+sys	0m0.225s
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -279,10 +278,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
 treatcards run
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-WARNING! CPPRunCard instance has no attribute path
 quit
 INFO:  
 launch in debug mode
diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
index 618adbca06..22e76563ab 100644
--- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
@@ -46,4 +46,4 @@ define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
 output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --\
-vector_size=16384 --me_exporter=standalone_cudacpp
+vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat
index 86f6a33258..1084532333 100644
--- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat
@@ -85,7 +85,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 =  hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact
+   -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs)  
+#*********************************************************************
+# Compilation flag. 
+#*********************************************************************   
+   -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+   --fast-math  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -182,12 +198,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
 
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
 
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat
index 67c1a4de28..0fa0402261 100644
--- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat
@@ -85,7 +85,23 @@
    2  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 =  hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact
+   -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs)  
+#*********************************************************************
+# Compilation flag. 
+#*********************************************************************   
+   -O = global_flag ! fortran optimization flag use for the all code.
+   --fast-math  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -181,3 +197,11 @@
 #
 systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts
index 6409e99a49..e4b87ee6ad 100644
--- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts
+++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts
@@ -1,7 +1,6 @@
 DEFAULT_CPP_COMPILER=g++
 DEFAULT_F2PY_COMPILER=f2py3
 DEFAULT_F_COMPILER=gfortran
-#end_of_make_opts_variables
 GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
index 3995ce8109..bd1517985f 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py
@@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if fct_mod:
             self.fct_mod[name] = fct_mod
 
-    def read(self, finput, consistency=True, unknown_warning=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file, 
            a file object, a str with the content of the file."""
            
         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
                 self.path = finput
                 finput = open(finput)
@@ -3913,7 +3915,7 @@ def remove_all_cut(self):
    %(global_flag)s = global_flag ! fortran optimization flag use for the all code.
    %(aloha_flag)s  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
    %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
-   %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep)
+   %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """
 
 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
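The `read()` change above records `self.path` even when the card arrives as raw text, provided the caller passes a `path=` keyword; without it, hooks such as `reset_simd` cannot locate the process Source directory. A reduced, runnable sketch of the patched logic (mock class, not banner.py itself):

```python
import os

class MockCard:
    """Reduced mock of the patched RunCard.read() path bookkeeping."""
    def read(self, finput, **opt):
        if isinstance(finput, str):
            if '\n' in finput:               # raw card content, not a filename
                lines = finput.split('\n')
                if 'path' in opt:
                    self.path = opt['path']  # new: remember the on-disk location
            elif os.path.isfile(finput):
                self.path = finput           # unchanged: read from disk
                with open(finput) as f:
                    lines = f.read().split('\n')
        return lines

card = MockCard()
card.read('16384 = vector_size\n1 = sde_strategy', path='/tmp/Cards/run_card.dat')
print(card.path)   # /tmp/Cards/run_card.dat
```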
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# SIMD/GPU configuration for the run_card -----------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
+        if not hasattr(self, 'path'):
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type':
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception('INTERNAL ERROR! reset_makeopts called for unexpected parameter %s' % name)
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already set up in default class (just change value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            super().write_one_include_file(output_dir, incname, output_file)
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already set up in default class (just change value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
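The `write_one_include_file` overload above regenerates vector.inc while keeping the Fortran comment header first: copy the 'C' lines into a temporary file, let the superclass regenerate the include, append the regenerated non-blank lines, then swap atomically. A standalone sketch of that merge (the `regenerate` callback stands in for the superclass call):

```python
import os

def merge_include(vectorinc, regenerate):
    """Sketch of the vector.inc merge: comment header first, then content."""
    # 1) copy the Fortran comment header ('C' lines) into a temp file
    with open(vectorinc + '.new', 'w') as fileout, open(vectorinc) as filein:
        for line in filein:
            if line.startswith('C'):
                fileout.write(line)
    # 2) regenerate vector.inc in place (super().write_one_include_file above)
    regenerate(vectorinc)
    # 3) append the regenerated non-blank lines, then swap atomically
    with open(vectorinc + '.new', 'a') as fileout, open(vectorinc) as filein:
        for line in filein:
            if not line.startswith('\n'):
                fileout.write(line)
    os.replace(vectorinc + '.new', vectorinc)
```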
diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
index 853aabc98a..cb6bf4ca57 100755
--- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):
       
         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
             else:
                 run_card = self.run_card
             self.run_card = run_card
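Call-site view of why `do_treatcards` now passes `path=`: run-card hooks derive the process Source directory from the card's on-disk location, two levels up from the card itself. A tiny sketch with a placeholder directory:

```python
import os
pjoin = os.path.join

# reset_simd / reset_makeopts compute the Source dir from self.path like this:
path = pjoin('/tmp/PROC_sm_1', 'Cards', 'run_card.dat')   # placeholder me_dir
Sourcedir = pjoin(os.path.dirname(os.path.dirname(path)), 'Source')
print(Sourcedir)   # /tmp/PROC_sm_1/Source
```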
diff --git a/epochX/cudacpp/ee_mumu.mad/mg5.in b/epochX/cudacpp/ee_mumu.mad/mg5.in
index 12a2c58512..4e83015b40 100644
--- a/epochX/cudacpp/ee_mumu.mad/mg5.in
+++ b/epochX/cudacpp/ee_mumu.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate e+ e- > mu+ mu-
-output madevent ee_mumu.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ee_mumu.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
index 8cb80f0d38..3447ea61e3 100644
--- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005633831024169922 
+DEBUG: model prefixing  takes 0.0055027008056640625 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -154,34 +154,34 @@ INFO: Checking for minimal orders which gives processes.
 INFO: Please specify coupling orders to bypass this step. 
 INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1  
 INFO: Process has 2 diagrams 
-1 processes with 2 diagrams generated in 0.005 s
+1 processes with 2 diagrams generated in 0.004 s
 Total: 1 processes with 2 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 
 INFO: Processing color information for process: e+ e- > mu+ mu- @1 
-DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
-DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 190] 
-DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 191] 
-DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 192] 
+DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194] 
+DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 195] 
+DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 196] 
+DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 197] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates FFV2 routines
 ALOHA: aloha creates FFV4 routines
 ALOHA: aloha creates FFV2_4 routines
-ALOHA: aloha creates 4 routines in  0.267 s
+ALOHA: aloha creates 4 routines in  0.270 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -198,9 +198,8 @@ FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 quit
 
-real	0m3.653s
-user	0m0.601s
-sys	0m0.049s
+real	0m0.657s
+user	0m0.602s
+sys	0m0.050s
diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
index a1fa47508f..3326a8488f 100644
--- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005694150924682617 
+DEBUG: model prefixing  takes 0.005394458770751953 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -157,15 +157,15 @@ INFO: Trying process: g g > t t~ WEIGHTED<=2 @1
 INFO: Process has 3 diagrams 
 1 processes with 3 diagrams generated in 0.008 s
 Total: 1 processes with 3 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
 INFO: initialize a new directory: CODEGEN_mad_gg_tt 
 INFO: remove old information in CODEGEN_mad_gg_tt 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards 
@@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t t~ @1 
 INFO: Creating files in directory P1_gg_ttx 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f560fa1e7c0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5f9e472700> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -184,23 +184,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  0 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttx 
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.101 s
+Wrote files for 10 helas calls in 0.102 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0
 ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 2 routines in  0.145 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+ALOHA: aloha creates 2 routines in  0.142 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0
 ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 4 routines in  0.132 s
+ALOHA: aloha creates 4 routines in  0.138 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -219,7 +219,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
@@ -231,16 +230,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
-DEBUG:  p.returncode =  0 [output.py at line 232] 
+DEBUG:  p.returncode =  0 [output.py at line 237] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m4.772s
-user	0m1.470s
-sys	0m0.223s
+real	0m1.677s
+user	0m1.453s
+sys	0m0.213s
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -268,9 +267,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
 treatcards run
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-WARNING! CPPRunCard instance has no attribute path
 quit
 INFO:  
 launch in debug mode
diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
index 4c14989a3f..cf111e2e6d 100644
--- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
@@ -46,4 +46,4 @@ define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
 output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --ve\
-ctor_size=16384 --me_exporter=standalone_cudacpp
+ctor_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat
index 80d07f317e..a6463dc262 100644
--- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat
@@ -80,7 +80,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of some channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+   --fast-math  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -140,12 +156,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
 
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
 
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
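
A minimal sketch (not part of the patch) of how the new SIMD/GPU options above can be read back through the plugin run-card class; the import path and card location are placeholders:

from launch_plugin import CPPRunCard            # hypothetical import of the plugin class

card = CPPRunCard('Cards/run_card.dat')         # RunCard.read() records card.path
print(card['floating_type'])                    # 'd' (double) unless the user changed it
print(card['avx_level'])                        # 'auto' lets cudacpp pick the SIMD level
print(card['cudacpp_backend'].upper())          # FORTRAN / CPP / CUDA
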
diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat
index 48d84c73f0..27e990a016 100644
--- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat
@@ -80,7 +80,23 @@
    2  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of some channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O = global_flag ! fortran optimization flag used for all the code.
+   --fast-math  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -139,3 +155,11 @@
 #
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
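
For contrast with these CPP defaults, a plain-dict sketch (not the actual RunCard machinery) of the two values that GPURunCard in launch_plugin.py flips:

cpp_defaults = {'floating_type': 'd', 'avx_level': 'auto',
                'cudacpp_backend': 'CPP', 'vector_size': 16}
gpu_defaults = dict(cpp_defaults, cudacpp_backend='CUDA', vector_size=16384)
print(gpu_defaults)
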
diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts
index 6409e99a49..e4b87ee6ad 100644
--- a/epochX/cudacpp/gg_tt.mad/Source/make_opts
+++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts
@@ -1,7 +1,6 @@
 DEFAULT_CPP_COMPILER=g++
 DEFAULT_F2PY_COMPILER=f2py3
 DEFAULT_F_COMPILER=gfortran
-#end_of_make_opts_variables
 GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
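
The FPTYPE/AVX values are pushed into this make_opts file via common_run_interface.CommonRunCmd.update_make_opts_full; a stand-in with the assumed observable behaviour (upsert KEY=VALUE lines, append missing keys), for illustration only:

def update_make_opts_full(path, values):
    """Illustrative stand-in: upsert KEY=VALUE lines in a make_opts-style file."""
    values = dict(values)                                 # copy: keys are popped once seen
    out = []
    for line in open(path).read().splitlines():
        key = line.split('=', 1)[0]
        if key in values:
            out.append('%s=%s' % (key, values.pop(key)))  # overwrite existing key
        else:
            out.append(line)                              # keep everything else as-is
    out.extend('%s=%s' % kv for kv in values.items())     # keys not yet in the file
    open(path, 'w').write('\n'.join(out) + '\n')

update_make_opts_full('Source/make_opts', {'FPTYPE': 'd', 'AVX': ''})
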
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
index 3995ce8109..bd1517985f 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py
@@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if fct_mod:
             self.fct_mod[name] = fct_mod
 
-    def read(self, finput, consistency=True, unknown_warning=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file, 
            a file object, a str with the content of the file."""
            
         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
                 self.path = finput
                 finput = open(finput)
@@ -3913,7 +3915,7 @@ def remove_all_cut(self):
    %(global_flag)s = global_flag ! fortran optimization flag used for all the code.
    %(aloha_flag)s  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
    %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
-   %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep)
+   %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """
 
 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
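
A sketch of the intended use of the new 'path' keyword (inferred from the do_treatcards call in madevent_interface.py below): when the card content arrives as a string, the attached path survives, so the reset hooks can still locate the process directory:

content = open('Cards/run_card.dat').read()     # contains '\n' -> string branch of read()
card = RunCard()                                # hypothetical: any banner_mod.RunCard subclass
card.read(content, path='Cards/run_card.dat')
assert card.path == 'Cards/run_card.dat'
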
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# SIMD/GPU run-card configuration block ------------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
+        if not hasattr(self, 'path'):
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type':
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception('INTERNAL ERROR! unexpected parameter %s in reset_makeopts' % name)
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already setup in default class (just change value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            super().write_one_include_file(output_dir, incname, output_file)
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already setup in default class (just change value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
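
The vector.inc special case in write_one_include_file above is compact; the same merge as a standalone sketch (comment lines kept first, regenerated content appended minus blank lines, atomic replace), with the regeneration step abstracted as a callback:

import os

def merge_vector_inc(vectorinc, regenerate):
    with open(vectorinc + '.new', 'w') as fileout:
        for line in open(vectorinc):
            if line.startswith('C'):            # Fortran comment lines survive up front
                fileout.write(line)
    regenerate()                                # i.e. super().write_one_include_file(...)
    with open(vectorinc + '.new', 'a') as fileout:
        for line in open(vectorinc):
            if not line.startswith('\n'):       # drop blank lines from the new content
                fileout.write(line)
    os.replace(vectorinc + '.new', vectorinc)   # atomically replace vector.inc
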
diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
index 853aabc98a..cb6bf4ca57 100755
--- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):
       
         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
             else:
                 run_card = self.run_card
             self.run_card = run_card
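
Why do_treatcards now passes an explicit path (sketch with a hypothetical process directory): with card.path set, the reset hooks can derive the Source directory in which to run 'make cleanavx':

import os
pjoin = os.path.join

card_path = pjoin('PROC_gg_tt', 'Cards', 'run_card.dat')   # hypothetical me_dir layout
Sourcedir = pjoin(os.path.dirname(os.path.dirname(card_path)), 'Source')
print(Sourcedir)                                           # PROC_gg_tt/Source
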
diff --git a/epochX/cudacpp/gg_tt.mad/mg5.in b/epochX/cudacpp/gg_tt.mad/mg5.in
index 7859bf9b80..b4b356fc51 100644
--- a/epochX/cudacpp/gg_tt.mad/mg5.in
+++ b/epochX/cudacpp/gg_tt.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~
-output madevent gg_tt.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent gg_tt.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
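
The codegen-time default drops to 32 while the run card can later raise it again; the recompile rule implemented by reset_simd above, as a sketch:

def needs_rebuild(old_value, new_value):
    # mirrors reset_simd: a vector_size that shrinks or stays equal is absorbed
    # without recompiling; growing past the compiled size forces 'make cleanavx'
    return int(new_value) > int(old_value)

print(needs_rebuild(32, 16))       # False: the existing build can handle it
print(needs_rebuild(32, 16384))    # True: rebuild required
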
diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
index 805df19bd9..f09b4fa669 100644
--- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.00567626953125 
+DEBUG: model prefixing  takes 0.00550079345703125 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -161,26 +161,26 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Processing color information for process: g g > t t~ @1 
-DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
-DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 190] 
-DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 191] 
-DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 192] 
+DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194] 
+DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 195] 
+DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 196] 
+DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 197] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0
 ALOHA: aloha creates FFV1 routines
-ALOHA: aloha creates 2 routines in  0.143 s
+ALOHA: aloha creates 2 routines in  0.146 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -193,9 +193,8 @@ FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 quit
 
-real	0m3.529s
-user	0m0.478s
-sys	0m0.048s
+real	0m0.583s
+user	0m0.466s
+sys	0m0.061s
diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
index 9d4dbd85f0..acacaf4036 100644
--- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
+++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005400419235229492 
+DEBUG: model prefixing  takes 0.0055658817291259766 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -165,15 +165,15 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2
 INFO: Process has 16 diagrams 
 1 processes with 16 diagrams generated in 0.019 s
 Total: 2 processes with 19 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
 INFO: initialize a new directory: CODEGEN_mad_gg_tt01g 
 INFO: remove old information in CODEGEN_mad_gg_tt01g 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards 
@@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t t~ @1 
 INFO: Creating files in directory P2_gg_ttxg 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f43c9ec5c70> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fbd240261c0> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -194,15 +194,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  0 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg 
 INFO: Creating files in directory P1_gg_ttx 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f43c9ecb2b0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fbd23fb5fd0> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -211,29 +211,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  1 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttx 
 Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s
-Wrote files for 46 helas calls in 0.242 s
+Wrote files for 46 helas calls in 0.245 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 set of routines with options: P0
 ALOHA: aloha creates VVVV3 set of routines with options: P0
 ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 5 routines in  0.324 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+ALOHA: aloha creates 5 routines in  0.321 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 set of routines with options: P0
 ALOHA: aloha creates VVVV3 set of routines with options: P0
 ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 10 routines in  0.308 s
+ALOHA: aloha creates 10 routines in  0.304 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -257,7 +257,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
@@ -277,16 +276,16 @@ Hunk #2 succeeded at 159 (offset 16 lines).
 Hunk #3 succeeded at 237 (offset 16 lines).
 Hunk #4 succeeded at 265 (offset 16 lines).
 Hunk #5 succeeded at 310 (offset 16 lines).
-DEBUG:  p.returncode =  0 [output.py at line 232] 
+DEBUG:  p.returncode =  0 [output.py at line 237] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m5.282s
-user	0m2.049s
-sys	0m0.227s
+real	0m2.697s
+user	0m2.018s
+sys	0m0.247s
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -314,9 +313,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
 treatcards run
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-WARNING! CPPRunCard instance has no attribute path
 quit
 INFO:  
 launch in debug mode
diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat
index d0845f65f5..06ea2195d8 100644
--- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat
@@ -47,4 +47,4 @@ define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
 add process g g > t t~ g
 output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False -\
--vector_size=16384 --me_exporter=standalone_cudacpp
+-vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat
index 3360456835..f7c6502eec 100644
--- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat
@@ -98,7 +98,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of some channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+   --fast-math  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -161,12 +177,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
 
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
 
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat
index 46378139c8..0e45d1a8c9 100644
--- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat
@@ -98,7 +98,23 @@
    2  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of some channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O = global_flag ! fortran optimization flag used for all the code.
+   --fast-math  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -160,3 +176,11 @@
 #
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts
index 6409e99a49..e4b87ee6ad 100644
--- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts
+++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts
@@ -1,7 +1,6 @@
 DEFAULT_CPP_COMPILER=g++
 DEFAULT_F2PY_COMPILER=f2py3
 DEFAULT_F_COMPILER=gfortran
-#end_of_make_opts_variables
 GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py
index 3995ce8109..bd1517985f 100755
--- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py
@@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if fct_mod:
             self.fct_mod[name] = fct_mod
 
-    def read(self, finput, consistency=True, unknown_warning=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file, 
            a file object, a str with the content of the file."""
            
         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
                 self.path = finput
                 finput = open(finput)
@@ -3913,7 +3915,7 @@ def remove_all_cut(self):
    %(global_flag)s = global_flag ! fortran optimization flag used for all the code.
    %(aloha_flag)s  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
    %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
-   %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep)
+   %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """
 
 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# SIMD/GPU run-card configuration block ------------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
+        if not hasattr(self, 'path'):
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type':
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception('INTERNAL ERROR! unexpected parameter %s in reset_makeopts' % name)
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already setup in default class (just change value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            super().write_one_include_file(output_dir, incname, output_file)
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already setup in default class (just change value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
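
As a sketch of what the 'simd' RunBlock renders when switched on (plain %-substitution; the RunBlock wiring itself is MG5-internal):

template_on = (
    "#***********************************************************************\n"
    "# SIMD/GPU configuration for the CUDACPP plugin\n"
    "#************************************************************************\n"
    " %(floating_type)s = floating_type ! f (single), d (double), m (mixed)\n"
    " %(avx_level)s = avx_level ! none, sse4, avx2, 512y, 512z, auto\n"
    " %(cudacpp_backend)s = cudacpp_backend ! FORTRAN, CPP, CUDA\n"
)
print(template_on % {'floating_type': 'd', 'avx_level': 'auto', 'cudacpp_backend': 'CPP'})
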
diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py
index 853aabc98a..cb6bf4ca57 100755
--- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):
       
         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
             else:
                 run_card = self.run_card
             self.run_card = run_card
diff --git a/epochX/cudacpp/gg_tt01g.mad/mg5.in b/epochX/cudacpp/gg_tt01g.mad/mg5.in
index a20e166e81..95984fcf10 100644
--- a/epochX/cudacpp/gg_tt01g.mad/mg5.in
+++ b/epochX/cudacpp/gg_tt01g.mad/mg5.in
@@ -2,4 +2,4 @@ set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~
 add process g g > t t~ g
-output madevent gg_tt01g.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent gg_tt01g.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
index 68afa8d9b0..b52dc31122 100644
--- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005378007888793945 
+DEBUG: model prefixing  takes 0.0053288936614990234 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -157,15 +157,15 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1
 INFO: Process has 16 diagrams 
 1 processes with 16 diagrams generated in 0.021 s
 Total: 1 processes with 16 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
 INFO: initialize a new directory: CODEGEN_mad_gg_ttg 
 INFO: remove old information in CODEGEN_mad_gg_ttg 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards 
@@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1
 INFO: Processing color information for process: g g > t t~ g @1 
 INFO: Creating files in directory P1_gg_ttxg 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fef242fb1f0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f705b94f700> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  0 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s
-Wrote files for 36 helas calls in 0.148 s
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s
+Wrote files for 36 helas calls in 0.155 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 set of routines with options: P0
 ALOHA: aloha creates VVVV3 set of routines with options: P0
 ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 5 routines in  0.323 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+ALOHA: aloha creates 5 routines in  0.326 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 set of routines with options: P0
 ALOHA: aloha creates VVVV3 set of routines with options: P0
 ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 10 routines in  0.309 s
+ALOHA: aloha creates 10 routines in  0.327 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -230,7 +230,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
@@ -246,16 +245,16 @@ Hunk #2 succeeded at 159 (offset 16 lines).
 Hunk #3 succeeded at 237 (offset 16 lines).
 Hunk #4 succeeded at 265 (offset 16 lines).
 Hunk #5 succeeded at 310 (offset 16 lines).
-DEBUG:  p.returncode =  0 [output.py at line 232] 
+DEBUG:  p.returncode =  0 [output.py at line 237] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m5.147s
-user	0m1.924s
-sys	0m0.225s
+real	0m2.189s
+user	0m1.943s
+sys	0m0.223s
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -283,9 +282,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
 treatcards run
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-WARNING! CPPRunCard instance has no attribute path
 quit
 INFO:  
 launch in debug mode
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
index a0ffbbc219..9d09090869 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
@@ -46,4 +46,4 @@ define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
 output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --v\
-ector_size=16384 --me_exporter=standalone_cudacpp
+ector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat
index a9fbb5a212..f119b5a1c7 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat
@@ -80,7 +80,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- integration channels related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+   --fast-math = aloha_flag ! fortran optimization flag for aloha functions. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -143,12 +159,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
 
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
 
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
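Note that the SIMD/GPU block added above follows the standard run-card convention, 'value = name ! comment', with '#' lines as block headers. For illustration only, a minimal sketch of a parser for this line format (parse_run_card_lines is a hypothetical helper, not the MG5aMC banner.py parser):

def parse_run_card_lines(lines):
    # keep 'value = name' pairs; drop '!' comments, '#' headers and blank lines
    params = {}
    for line in lines:
        line = line.split('!', 1)[0].strip()
        if '=' not in line or line.startswith('#'):
            continue
        value, name = (field.strip() for field in line.rsplit('=', 1))
        params[name] = value
    return params

assert parse_run_card_lines([' d = floating_type ! precision',
                             ' auto = avx_level ! SIMD level',
                             ' CPP = cudacpp_backend ! backend']) == \
    {'floating_type': 'd', 'avx_level': 'auto', 'cudacpp_backend': 'CPP'}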
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat
index 1464f610ba..e89c78702d 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat
@@ -80,7 +80,23 @@
    2  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- integration channels related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O = global_flag ! fortran optimization flag used for all the code.
+   --fast-math = aloha_flag ! fortran optimization flag for aloha functions. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -142,3 +158,11 @@
 #
 systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts
index 6409e99a49..e4b87ee6ad 100644
--- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts
+++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts
@@ -1,7 +1,6 @@
 DEFAULT_CPP_COMPILER=g++
 DEFAULT_F2PY_COMPILER=f2py3
 DEFAULT_F_COMPILER=gfortran
-#end_of_make_opts_variables
 GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
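Source/make_opts keeps the build configuration as plain 'KEY=value' assignments (GLOBAL_FLAG etc.), which the plugin now rewrites at build time with FPTYPE and AVX values derived from floating_type and avx_level. A minimal sketch of that kind of rewrite, assuming plain 'KEY=value' lines (update_make_opts is a hypothetical helper for illustration; the actual update is done by CommonRunCmd.update_make_opts_full):

import os

def update_make_opts(path, variables):
    # overwrite existing KEY=value assignments, append the keys not yet present
    with open(path) as filein:
        lines = [line.rstrip('\n') for line in filein]
    seen = set()
    for i, line in enumerate(lines):
        key = line.split('=', 1)[0]
        if key in variables:
            lines[i] = '%s=%s' % (key, variables[key])
            seen.add(key)
    lines += ['%s=%s' % (k, v) for k, v in variables.items() if k not in seen]
    with open(path + '.new', 'w') as fileout:
        fileout.write('\n'.join(lines) + '\n')
    os.replace(path + '.new', path)  # atomic swap: an interrupted update cannot truncate the file

For example, update_make_opts('Source/make_opts', {'FPTYPE': 'd', 'AVX': 'avx2'}) would pin double precision and AVX2 for the next make invocation.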
diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py
index 3995ce8109..bd1517985f 100755
--- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py
@@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if fct_mod:
             self.fct_mod[name] = fct_mod
 
-    def read(self, finput, consistency=True, unknown_warning=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file, 
            a file object, a str with the content of the file."""
            
         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
                 self.path = finput
                 finput = open(finput)
@@ -3913,7 +3915,7 @@ def remove_all_cut(self):
    %(global_flag)s = global_flag ! fortran optimization flag use for the all code.
    %(aloha_flag)s  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
    %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
-   %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep)
+   %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """
 
 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
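The new 'path' keyword in read() covers the case where the run card arrives as a string of content (e.g. extracted from a banner) rather than as a file name: previously self.path was only set in the file branch, which is what triggered the "CPPRunCard instance has no attribute path" warning removed from the logs above, and what the reset hooks in launch_plugin.py (below) rely on to locate the Source directory. A minimal sketch of the pattern, using CardSketch as a simplified stand-in for the real banner.RunCard:

import os

class CardSketch(dict):
    # simplified stand-in for banner.RunCard.read, showing only the path plumbing
    def read(self, finput, **opt):
        if not isinstance(finput, str):
            raise TypeError('this sketch only handles string input')
        if '\n' in finput:            # card content passed as a string
            lines = finput.split('\n')
            if 'path' in opt:
                self.path = opt['path']   # new: remember where the card lives on disk
        elif os.path.isfile(finput):  # card passed as a file name
            self.path = finput
            with open(finput) as filein:
                lines = filein.read().split('\n')
        else:
            raise IOError('unknown run card input %r' % finput)
        for line in lines:
            pass  # parse 'value = name ! comment' lines as usual

card = CardSketch()
card.read(' 32 = vector_size ! number of events in lockstep\n', path='Cards/run_card.dat')
assert card.path == 'Cards/run_card.dat'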
diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()    
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):            
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# Phase-Space Optimization ------------------------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
+        if not hasattr(self, 'path'):
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type': # pass the make_opts path explicitly, as in CPPMEInterface.compile above
+            common_run_interface.CommonRunCmd.update_make_opts_full(pjoin(Sourcedir, 'make_opts'), {'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(pjoin(Sourcedir, 'make_opts'), {'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception("INTERNAL ERROR! reset_makeopts called for unexpected parameter '%s'" % name)
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already setup in default class (just change value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            super().write_one_include_file(output_dir, incname, output_file)
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already setup in default class (just change value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
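The write_one_include_file override above preserves the Fortran comment header of vector.inc (lines starting with 'C') across regeneration: it copies the comments into a temporary file, lets the parent class rewrite vector.inc, appends the regenerated non-blank lines, and atomically swaps the result back. A standalone distillation of that two-pass approach (rewrite_keeping_comments and the regenerate callback are hypothetical names, for illustration only):

import os

def rewrite_keeping_comments(vectorinc, regenerate):
    # pass 1: save the Fortran comment header (lines starting with 'C')
    with open(vectorinc + '.new', 'w') as fileout:
        with open(vectorinc) as filein:
            for line in filein:
                if line.startswith('C'):
                    fileout.write(line)
    regenerate(vectorinc)  # rewrites vectorinc in place, losing the comments
    # pass 2: append the regenerated content, skipping blank lines
    with open(vectorinc + '.new', 'a') as fileout:
        with open(vectorinc) as filein:
            for line in filein:
                if not line.startswith('\n'):
                    fileout.write(line)
    os.replace(vectorinc + '.new', vectorinc)  # atomic swap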
diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py
index 853aabc98a..cb6bf4ca57 100755
--- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):
       
         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
             else:
                 run_card = self.run_card
             self.run_card = run_card
diff --git a/epochX/cudacpp/gg_ttg.mad/mg5.in b/epochX/cudacpp/gg_ttg.mad/mg5.in
index 98f53ce50d..e37d10d865 100644
--- a/epochX/cudacpp/gg_ttg.mad/mg5.in
+++ b/epochX/cudacpp/gg_ttg.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~ g
-output madevent gg_ttg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent gg_ttg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
index 97056958fe..b9716683be 100644
--- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005817890167236328 
+DEBUG: model prefixing  takes 0.005490303039550781 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.022 s
+1 processes with 16 diagrams generated in 0.021 s
 Total: 1 processes with 16 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 
 INFO: Processing color information for process: g g > t t~ g @1 
-DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
-DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 190] 
-DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 191] 
-DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 192] 
+DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194] 
+DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 195] 
+DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 196] 
+DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 197] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 set of routines with options: P0
 ALOHA: aloha creates VVVV3 set of routines with options: P0
 ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 5 routines in  0.326 s
+ALOHA: aloha creates 5 routines in  0.322 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -201,9 +201,8 @@ FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 quit
 
-real	0m3.779s
-user	0m0.713s
-sys	0m0.062s
+real	0m0.784s
+user	0m0.699s
+sys	0m0.063s
diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
index eacd7a356a..ee3d38dfb1 100644
--- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.0053293704986572266 
+DEBUG: model prefixing  takes 0.00561833381652832 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1  
 INFO: Process has 123 diagrams 
-1 processes with 123 diagrams generated in 0.157 s
+1 processes with 123 diagrams generated in 0.154 s
 Total: 1 processes with 123 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
 INFO: initialize a new directory: CODEGEN_mad_gg_ttgg 
 INFO: remove old information in CODEGEN_mad_gg_ttgg 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards 
@@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1
 INFO: Processing color information for process: g g > t t~ g g @1 
 INFO: Creating files in directory P1_gg_ttxgg 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f56507006d0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f38a34ccfa0> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  0 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s
-Wrote files for 222 helas calls in 0.691 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s
+Wrote files for 222 helas calls in 0.679 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in  0.333 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+ALOHA: aloha creates 5 routines in  0.323 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in  0.316 s
+ALOHA: aloha creates 10 routines in  0.314 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -233,7 +233,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
@@ -249,15 +248,15 @@ Hunk #2 succeeded at 191 (offset 48 lines).
 Hunk #3 succeeded at 269 (offset 48 lines).
 Hunk #4 succeeded at 297 (offset 48 lines).
 Hunk #5 succeeded at 342 (offset 48 lines).
-DEBUG:  p.returncode =  0 [output.py at line 232] 
+DEBUG:  p.returncode =  0 [output.py at line 237] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m6.262s
-user	0m3.028s
+real	0m3.226s
+user	0m2.976s
 sys	0m0.232s
 ************************************************************
 *                                                          *
@@ -286,9 +285,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
 treatcards run
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-WARNING! CPPRunCard instance has no attribute path
 quit
 INFO:  
 launch in debug mode
diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat
index b7568d1a73..ea9cfcde68 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat
@@ -46,4 +46,4 @@ define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
 output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --\
-vector_size=16384 --me_exporter=standalone_cudacpp
+vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat
index 88d4377d71..42728a48f3 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat
@@ -80,7 +80,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- integration channels related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+   --fast-math = aloha_flag ! fortran optimization flag for aloha functions. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -168,12 +184,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
 
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
 
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat
index 26fa7f39dd..0374d7ba60 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat
@@ -80,7 +80,23 @@
    2  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- integration channels related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O = global_flag ! fortran optimization flag used for all the code.
+   --fast-math = aloha_flag ! fortran optimization flag for aloha functions. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -167,3 +183,11 @@
 #
 systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts
index 6409e99a49..e4b87ee6ad 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts
+++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts
@@ -1,7 +1,6 @@
 DEFAULT_CPP_COMPILER=g++
 DEFAULT_F2PY_COMPILER=f2py3
 DEFAULT_F_COMPILER=gfortran
-#end_of_make_opts_variables
 GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py
index 3995ce8109..bd1517985f 100755
--- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py
@@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if fct_mod:
             self.fct_mod[name] = fct_mod
 
-    def read(self, finput, consistency=True, unknown_warning=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file, 
            a file object, a str with the content of the file."""
            
         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
                 self.path = finput
                 finput = open(finput)
@@ -3913,7 +3915,7 @@ def remove_all_cut(self):
    %(global_flag)s = global_flag ! fortran optimization flag use for the all code.
    %(aloha_flag)s  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
    %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
-   %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep)
+   %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """
 
 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()    
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):            
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# Phase-Space Optimization ------------------------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
+        if not hasattr(self, 'path'):
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type': # pass the make_opts path explicitly, as in CPPMEInterface.compile above
+            common_run_interface.CommonRunCmd.update_make_opts_full(pjoin(Sourcedir, 'make_opts'), {'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(pjoin(Sourcedir, 'make_opts'), {'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception("INTERNAL ERROR! reset_makeopts called for unexpected parameter '%s'" % name)
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already setup in default class (just change value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            super().write_one_include_file(output_dir, incname, output_file)
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already setup in default class (just change value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py
index 853aabc98a..cb6bf4ca57 100755
--- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):
       
         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
             else:
                 run_card = self.run_card
             self.run_card = run_card
diff --git a/epochX/cudacpp/gg_ttgg.mad/mg5.in b/epochX/cudacpp/gg_ttgg.mad/mg5.in
index e2c5858b63..53784bf161 100644
--- a/epochX/cudacpp/gg_ttgg.mad/mg5.in
+++ b/epochX/cudacpp/gg_ttgg.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~ g g
-output madevent gg_ttgg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent gg_ttgg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index 80631c94bf..d62aabc436 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.00567317008972168 
+DEBUG: model prefixing  takes 0.005473136901855469 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -161,29 +161,29 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 
 INFO: Processing color information for process: g g > t t~ g g @1 
-DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
-DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 190] 
-DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 191] 
-DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 192] 
+DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194] 
+DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 195] 
+DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 196] 
+DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 197] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in  0.318 s
+ALOHA: aloha creates 5 routines in  0.894 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -204,9 +204,8 @@ FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 quit
 
-real	0m4.435s
-user	0m1.373s
+real	0m2.023s
+user	0m1.361s
 sys	0m0.056s
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index ab3974344c..d94e7252af 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005319833755493164 
+DEBUG: model prefixing  takes 0.005348920822143555 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1  
 INFO: Process has 1240 diagrams 
-1 processes with 1240 diagrams generated in 1.855 s
+1 processes with 1240 diagrams generated in 1.857 s
 Total: 1 processes with 1240 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
 INFO: initialize a new directory: CODEGEN_mad_gg_ttggg 
 INFO: remove old information in CODEGEN_mad_gg_ttggg 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards 
@@ -175,9 +175,9 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1
 INFO: Processing color information for process: g g > t t~ g g g @1 
 INFO: Creating files in directory P1_gg_ttxggg 
 INFO: Computing Color-Flow optimization [15120 term] 
-INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction 
+INFO: Color-Flow passed to 1630 term in 7s. Introduce 3030 contraction 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fe481da42e0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f19857558e0> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -186,29 +186,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 
0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  0 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.574 s
-Wrote files for 2281 helas calls in 18.431 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.607 s
+Wrote files for 2281 helas calls in 18.169 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in  0.335 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+ALOHA: aloha creates 5 routines in  0.317 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in  0.313 s
+ALOHA: aloha creates 10 routines in  0.308 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -235,7 +235,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
@@ -251,16 +250,16 @@ Hunk #2 succeeded at 255 (offset 112 lines).
 Hunk #3 succeeded at 333 (offset 112 lines).
 Hunk #4 succeeded at 361 (offset 112 lines).
 Hunk #5 succeeded at 406 (offset 112 lines).
-DEBUG:  p.returncode =  0 [output.py at line 232] 
+DEBUG:  p.returncode =  0 [output.py at line 237] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m32.103s
-user	0m28.586s
-sys	0m0.412s
+real	0m28.894s
+user	0m28.357s
+sys	0m0.382s
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -288,9 +287,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
 treatcards run
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-WARNING! CPPRunCard instance has no attribute path
 quit
 INFO:  
 launch in debug mode
diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat
index 2f92ecc4ba..3923568dd8 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat
@@ -46,4 +46,4 @@ define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
 output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False -\
--vector_size=16384 --me_exporter=standalone_cudacpp
+-vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat
index 0a6bf20eb9..8170176a11 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat
@@ -80,7 +80,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+   --fast-math = aloha_flag ! fortran optimization flag for aloha functions. Suggestion: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for the matrix.f function. Suggestion: '-O3'
+   16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -174,12 +190,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
 
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#***********************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
 
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat
index 3714e71997..8da9ac1563 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat
@@ -80,7 +80,23 @@
    2  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O = global_flag ! fortran optimization flag used for all the code.
+   --fast-math = aloha_flag ! fortran optimization flag for aloha functions. Suggestion: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for the matrix.f function. Suggestion: '-O3'
+   16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -173,3 +189,11 @@
 #
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#***********************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts
index 6409e99a49..e4b87ee6ad 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts
+++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts
@@ -1,7 +1,6 @@
 DEFAULT_CPP_COMPILER=g++
 DEFAULT_F2PY_COMPILER=f2py3
 DEFAULT_F_COMPILER=gfortran
-#end_of_make_opts_variables
 GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py
index 3995ce8109..bd1517985f 100755
--- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py
@@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if fct_mod:
             self.fct_mod[name] = fct_mod
 
-    def read(self, finput, consistency=True, unknown_warning=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file, 
            a file object, a str with the content of the file."""
            
         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
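+                # the card content was passed as a string: remember the original
+                # file location if the caller provided one via the 'path' keyword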
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
                 self.path = finput
                 finput = open(finput)
@@ -3913,7 +3915,7 @@ def remove_all_cut(self):
    %(global_flag)s = global_flag ! fortran optimization flag used for all the code.
    %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha functions. Suggestion: '-ffast-math'
    %(matrix_flag)s = matrix_flag ! fortran optimization flag for the matrix.f function. Suggestion: '-O3'
-   %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep)
+   %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """
 
 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()    
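+        # when compiling in the Source directory, first bring make_opts in line
+        # with the SIMD settings of the run_card (floating_type and avx_level)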
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):            
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# SIMD/GPU run_card block ------------------------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
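+# expose the template above as a new 'simd' run_card block; it is rendered
+# because default_setup() below appends 'simd' to display_block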
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
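+        # fct_mod hook: a user change of floating_type or avx_level updates
+        # make_opts and triggers 'make cleanavx' so the next build picks it up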
+        if not hasattr(self, 'path'):
+            raise Exception
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type':
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already set up in the default class (just change the value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            super().write_one_include_file(output_dir, incname, output_file)
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already set up in the default class (just change the value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py
index 853aabc98a..cb6bf4ca57 100755
--- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):
       
         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
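+                # the explicit path ensures self.path is set even when the run
+                # card is read from a string (see the read() change in banner.py)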
             else:
                 run_card = self.run_card
             self.run_card = run_card
diff --git a/epochX/cudacpp/gg_ttggg.mad/mg5.in b/epochX/cudacpp/gg_ttggg.mad/mg5.in
index cdbc845cdd..f92d17d219 100644
--- a/epochX/cudacpp/gg_ttggg.mad/mg5.in
+++ b/epochX/cudacpp/gg_ttggg.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~ g g g
-output madevent gg_ttggg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent gg_ttggg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
index 33bae20142..8660fec52a 100644
--- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005532503128051758 
+DEBUG: model prefixing  takes 0.005609273910522461 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGHTED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1  
 INFO: Process has 1240 diagrams 
-1 processes with 1240 diagrams generated in 1.880 s
+1 processes with 1240 diagrams generated in 1.857 s
 Total: 1 processes with 1240 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 
 INFO: Processing color information for process: g g > t t~ g g g @1 
-DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
-DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 190] 
-DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 191] 
-DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 192] 
+DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194] 
+DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 195] 
+DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 196] 
+DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 197] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.540 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.536 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in  0.351 s
+ALOHA: aloha creates 5 routines in  0.341 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -204,9 +204,8 @@ FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 quit
 
-real	0m15.959s
-user	0m12.810s
-sys	0m0.102s
+real	0m12.866s
+user	0m12.717s
+sys	0m0.100s
diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
index 89cb2749b0..97f5e25170 100644
--- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.0057373046875 
+DEBUG: model prefixing  takes 0.005373477935791016 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -170,17 +170,17 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s
+8 processes with 40 diagrams generated in 0.077 s
 Total: 8 processes with 40 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Additional matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
 INFO: initialize a new directory: CODEGEN_mad_gq_ttq 
 INFO: remove old information in CODEGEN_mad_gq_ttq 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards 
@@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~
 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Creating files in directory P1_gu_ttxu 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f395a95ddc0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fc08deefc40> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -207,15 +207,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  0 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gu_ttxu 
 INFO: Creating files in directory P1_gux_ttxux 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f395aad79d0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fc08dd9ac10> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -224,19 +224,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  1 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gux_ttxux 
 Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s
-Wrote files for 32 helas calls in 0.217 s
+Wrote files for 32 helas calls in 0.219 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates 2 routines in  0.144 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVV1 routines
@@ -260,7 +260,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
@@ -288,16 +287,16 @@ Hunk #2 succeeded at 162 (offset 19 lines).
 Hunk #3 succeeded at 247 (offset 26 lines).
 Hunk #4 succeeded at 281 (offset 32 lines).
 Hunk #5 succeeded at 326 (offset 32 lines).
-DEBUG:  p.returncode =  0 [output.py at line 232] 
+DEBUG:  p.returncode =  0 [output.py at line 237] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m4.915s
-user	0m1.680s
-sys	0m0.237s
+real	0m1.929s
+user	0m1.701s
+sys	0m0.227s
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -325,9 +324,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
 treatcards run
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-WARNING! CPPRunCard instance has no attribute path
 quit
 INFO:  
 launch in debug mode
diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat
index efb0752a31..c49c39c3c4 100644
--- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat
@@ -48,4 +48,4 @@ define vl~ = ve~ vm~ vt~
 define q = u c d s u~ c~ d~ s~
 generate g q > t t~ q
 output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --v\
-ector_size=16384 --me_exporter=standalone_cudacpp
+ector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat
index 69e6fa2bf2..d987e643f0 100644
--- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat
@@ -82,7 +82,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+   --fast-math = aloha_flag ! fortran optimization flag for aloha functions. Suggestion: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for the matrix.f function. Suggestion: '-O3'
+   16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -145,12 +161,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
 
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#***********************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
 
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat
index 4c99f39248..ef9e4b2707 100644
--- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat
@@ -82,7 +82,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration.
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O = global_flag ! fortran optimization flag used for all the code.
+   --fast-math = aloha_flag ! fortran optimization flag for aloha functions. Suggestion: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for the matrix.f function. Suggestion: '-O3'
+   16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -144,3 +160,11 @@
 #
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#***********************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts
index 6409e99a49..e4b87ee6ad 100644
--- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts
+++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts
@@ -1,7 +1,6 @@
 DEFAULT_CPP_COMPILER=g++
 DEFAULT_F2PY_COMPILER=f2py3
 DEFAULT_F_COMPILER=gfortran
-#end_of_make_opts_variables
 GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
index 3995ce8109..bd1517985f 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py
@@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if fct_mod:
             self.fct_mod[name] = fct_mod
 
-    def read(self, finput, consistency=True, unknown_warning=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file, 
            a file object, a str with the content of the file."""
            
         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
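+                # the card content was passed as a string: remember the original
+                # file location if the caller provided one via the 'path' keyword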
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
                 self.path = finput
                 finput = open(finput)
@@ -3913,7 +3915,7 @@ def remove_all_cut(self):
    %(global_flag)s = global_flag ! fortran optimization flag used for all the code.
    %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha functions. Suggestion: '-ffast-math'
    %(matrix_flag)s = matrix_flag ! fortran optimization flag for the matrix.f function. Suggestion: '-O3'
-   %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep)
+   %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """
 
 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()    
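+        # when compiling in the Source directory, first bring make_opts in line
+        # with the SIMD settings of the run_card (floating_type and avx_level)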
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):            
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# SIMD/GPU run_card block ------------------------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
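+# expose the template above as a new 'simd' run_card block; it is rendered
+# because default_setup() below appends 'simd' to display_block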
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
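+        # fct_mod hook: a user change of floating_type or avx_level updates
+        # make_opts and triggers 'make cleanavx' so the next build picks it up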
+        if not hasattr(self, 'path'):
+            raise Exception
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type':
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception('INTERNAL ERROR! reset_makeopts called for unexpected parameter %s' % name)
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
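A toy sketch of the fct_mod hook used by the two reset_* methods above, assuming (as the reset_simd signature suggests) that a callback registered via fct_mod=(fct, args, kwargs) is invoked as fct(old_value, new_value, name) when the user changes that parameter; the real dispatch lives in banner_mod.RunCard:

class MiniCard(dict):
    # Toy run card: remember a per-parameter callback, fire it on change.
    def __init__(self):
        super().__init__()
        self._fct_mod = {}
    def add_param(self, name, default, fct_mod=None):
        self[name] = default
        if fct_mod: self._fct_mod[name] = fct_mod
    def set_param(self, name, new_value):
        old_value = self[name]
        self[name] = new_value
        if name in self._fct_mod and new_value != old_value:
            fct, args, kwargs = self._fct_mod[name]
            fct(old_value, new_value, name, *args, **kwargs)

card = MiniCard()
card.add_param('floating_type', 'd',
               fct_mod=(lambda old, new, name: print('%s: %s -> %s (clean AVX objects)' % (name, old, new)), (), {}))
card.set_param('floating_type', 'f')  # fires the callback, i.e. the recompilation path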
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already setup in default class (just change value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
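+            # first pass: copy the Fortran comment header ('C' lines) of the existing vector.inc into vector.inc.new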
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            super().write_one_include_file(output_dir, incname, output_file)
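+            # second pass: append the freshly regenerated non-blank lines, then atomically swap the new file in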
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already setup in default class (just change value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
index 853aabc98a..cb6bf4ca57 100755
--- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):
       
         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
             else:
                 run_card = self.run_card
             self.run_card = run_card
diff --git a/epochX/cudacpp/gq_ttq.mad/mg5.in b/epochX/cudacpp/gq_ttq.mad/mg5.in
index e93843b8cd..2273ae9cfd 100644
--- a/epochX/cudacpp/gq_ttq.mad/mg5.in
+++ b/epochX/cudacpp/gq_ttq.mad/mg5.in
@@ -2,4 +2,4 @@ set stdout_level DEBUG
 set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 generate g q > t t~ q
-output madevent gq_ttq.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent gq_ttq.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index 16374bd28e..91509797eb 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.005791902542114258 
+DEBUG: model prefixing  takes 0.005265951156616211 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -170,14 +170,14 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s
+8 processes with 40 diagrams generated in 0.077 s
 Total: 8 processes with 40 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 
@@ -190,28 +190,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1
 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
-DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
-DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 190] 
-DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 191] 
-DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 192] 
+DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194] 
+DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 195] 
+DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 196] 
+DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 197] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
-DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 190] 
-DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 191] 
-DEBUG:    type(me)=<class 'int'> me=1 [output.py at line 192] 
+DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194] 
+DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 195] 
+DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 196] 
+DEBUG:    type(me)=<class 'int'> me=1 [output.py at line 197] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
 Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVV1 routines
-ALOHA: aloha creates 2 routines in  0.144 s
+ALOHA: aloha creates 2 routines in  0.141 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -225,9 +225,8 @@ FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 quit
 
-real	0m3.656s
-user	0m0.594s
-sys	0m0.059s
+real	0m0.640s
+user	0m0.574s
+sys	0m0.061s
diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
index 3b04fc3fb3..452dbff73e 100644
--- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
+++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
@@ -135,25 +135,25 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_h
 Load PLUGIN.CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  plugin [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 
 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 
-DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189] 
-DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 190] 
-DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 191] 
-DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 192] 
+DEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194] 
+DEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [output.py at line 195] 
+DEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [output.py at line 196] 
+DEBUG:    type(me)=<class 'int'> me=0 [output.py at line 197] 
 INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. 
 Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVS3 routines
-ALOHA: aloha creates 1 routines in  0.062 s
+ALOHA: aloha creates 1 routines in  0.060 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVS3
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h
 INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
@@ -163,9 +163,8 @@ FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc
 INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory 
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 quit
 
-real	0m3.422s
-user	0m0.371s
-sys	0m0.048s
+real	0m0.415s
+user	0m0.358s
+sys	0m0.050s
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index 8b6ca99446..e73dd42300 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define j = p
 INFO: load particles 
 INFO: load vertices 
-DEBUG: model prefixing  takes 0.00538325309753418 
+DEBUG: model prefixing  takes 0.0053882598876953125 
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 DEBUG: Simplifying conditional expressions 
 DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~
 INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ 
 INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ 
 INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ 
-5 processes with 7 diagrams generated in 0.029 s
+5 processes with 7 diagrams generated in 0.031 s
 Total: 5 processes with 7 diagrams
 add process p p > t t~ j @1
 INFO: Checking for minimal orders which gives processes. 
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
 INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g 
 INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ 
 INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g 
-13 processes with 76 diagrams generated in 0.134 s
+13 processes with 76 diagrams generated in 0.141 s
 Total: 18 processes with 83 diagrams
 add process p p > t t~ j j @2
 INFO: Checking for minimal orders which gives processes. 
@@ -378,17 +378,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
 INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ 
 INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ 
 INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.811 s
+65 processes with 1119 diagrams generated in 1.922 s
 Total: 83 processes with 1202 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT
 Output will be done with PLUGIN: CUDACPP_OUTPUT
 DEBUG:  cformat =  standalone_cudacpp [export_cpp.py at line 3071] 
-DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155] 
+DEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160] 
 INFO: initialize a new directory: CODEGEN_mad_pp_tt012j 
 INFO: remove old information in CODEGEN_mad_pp_tt012j 
-DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160] 
+DEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165] 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j 
 INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j 
 WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards 
@@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED
 INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 
 INFO: Creating files in directory P2_gg_ttxgg 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ed4c0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f33336d4c40> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -506,15 +506,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  0 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg 
 INFO: Creating files in directory P2_gg_ttxuux 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ea100> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f33336d4430> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -523,15 +523,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  1 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux 
 INFO: Creating files in directory P2_gu_ttxgu 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ea100> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333a1deb0> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -540,15 +540,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  2 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu 
 INFO: Creating files in directory P2_gux_ttxgux 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e7e7670> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f333422a1c0> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -557,15 +557,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  3 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux 
 INFO: Creating files in directory P2_uux_ttxgg 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e7e7670> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333864250> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -574,15 +574,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  4 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg 
 INFO: Creating files in directory P1_gg_ttxg 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1f047160> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333864250> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -591,15 +591,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  5 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg 
 INFO: Creating files in directory P2_uu_ttxuu 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e793a00> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333a24ca0> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -608,15 +608,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  6 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu 
 INFO: Creating files in directory P2_uux_ttxuux 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ea100> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f33339e5be0> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -625,15 +625,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  7 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux 
 INFO: Creating files in directory P2_uxux_ttxuxux 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ea100> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333ee8f70> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -642,15 +642,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  8 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux 
 INFO: Creating files in directory P2_uc_ttxuc 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ea100> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333d6d850> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -659,15 +659,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  9 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc 
 INFO: Creating files in directory P2_uux_ttxccx 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ed4c0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333a1dd60> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -676,15 +676,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  10 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx 
 INFO: Creating files in directory P2_ucx_ttxucx 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1f047160> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f33339c4d30> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -693,15 +693,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  11 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx 
 INFO: Creating files in directory P2_uxcx_ttxuxcx 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ea100> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333d6d850> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -710,15 +710,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  12 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 
 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx 
 INFO: Creating files in directory P1_gu_ttxu 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e4ea100> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333d6d850> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -727,15 +727,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  13 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gu_ttxu 
 INFO: Creating files in directory P1_gux_ttxux 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e7ddac0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333d6d850> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -744,15 +744,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  14 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gux_ttxux 
 INFO: Creating files in directory P1_uux_ttxg 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e7a9640> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f33339e3a00> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -761,15 +761,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  15 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group uux_ttxg 
 INFO: Creating files in directory P0_gg_ttx 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1e79d2e0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f3333a1dd60> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -778,15 +778,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1, 2, 3] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  16 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 
 INFO: Finding symmetric diagrams for subprocess group gg_ttx 
 INFO: Creating files in directory P0_uux_ttx 
 DEBUG:  kwargs[prefix] = 0 [model_handling.py at line 1058] 
-DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1b1eb8eee0> [export_v4.py at line 6262] 
+DEBUG:  process_exporter_cpp =  <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f33336d4c40> [export_v4.py at line 6262] 
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -795,29 +795,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG:  config_map =  [1] [export_cpp.py at line 711] 
 DEBUG:  subproc_number =  17 [export_cpp.py at line 712] 
 DEBUG:  Done [export_cpp.py at line 713] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
-DEBUG:  vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
+DEBUG:  vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
 INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 
 INFO: Finding symmetric diagrams for subprocess group uux_ttx 
-Generated helas calls for 18 subprocesses (372 diagrams) in 1.267 s
-Wrote files for 810 helas calls in 3.215 s
+Generated helas calls for 18 subprocesses (372 diagrams) in 1.279 s
+Wrote files for 810 helas calls in 3.193 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in  0.333 s
-DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197] 
+ALOHA: aloha creates 5 routines in  0.331 s
+DEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in  0.312 s
+ALOHA: aloha creates 10 routines in  0.310 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -844,7 +844,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG:  'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True <class 'PLUGIN.CUDACPP_OUTPUT.output.PLUGIN_ProcessExporter'> [output.py at line 206] 
 DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
@@ -1022,16 +1021,16 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG:  p.returncode =  0 [output.py at line 232] 
+DEBUG:  p.returncode =  0 [output.py at line 237] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m11.764s
-user	0m8.242s
-sys	0m0.480s
+real	0m8.865s
+user	0m8.358s
+sys	0m0.477s
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -1059,9 +1058,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 No valid web browser found. Please set in ./input/mg5_configuration.txt
 treatcards run
-run_card missed argument cudacpp_backend. Takes default: CPP
-run_card missed argument cudacpp_backend. Takes default: CPP
-WARNING! CPPRunCard instance has no attribute path
 quit
 INFO:  
 launch in debug mode
diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
index c0b1a2fd98..22837639b6 100644
--- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
@@ -50,4 +50,4 @@ generate p p > t t~ @0
 add process p p > t t~ j @1
 add process p p > t t~ j j @2
 output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False \
---vector_size=16384 --me_exporter=standalone_cudacpp
+--vector_size=32 --me_exporter=standalone_cudacpp
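
Note that the output-time --vector_size only sets the default VECSIZE_MEMMAX compiled into the generated code; the run card value (still 16384 in the diff below) can later raise it at the cost of a rebuild. A toy sketch of the rule applied by CPPRunCard.reset_simd further down in this diff (hypothetical helper, not plugin code):

    # Toy sketch (hypothetical helper, not plugin code) of the recompilation
    # rule in CPPRunCard.reset_simd: code built for old_value can handle any
    # smaller vector_size, so only a growing value forces a rebuild.
    def needs_recompile(old_value: int, new_value: int) -> bool:
        return new_value > int(old_value)

    assert needs_recompile(32, 16384)      # codegen default 32 -> run card 16384
    assert not needs_recompile(16384, 32)  # shrinking the vector size is free
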
diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat
index f4ca7c0b5d..8b1007a3e5 100644
--- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat
+++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat
@@ -98,7 +98,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)
+   --fast-math  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -186,12 +202,10 @@
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
 
-#*********************************************************************
-# Options for the cudacpp plugin
-#*********************************************************************
-
-# Set cudacpp-specific values of non-cudacpp-specific options
--O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
 
-# New cudacpp-specific options (default values are defined in banner.py)
-CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA
diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat
index 58e12c11d7..2f72b65255 100644
--- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat
+++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat
@@ -98,7 +98,23 @@
    1  = sde_strategy  ! default integration strategy (hep-ph/2021.00773)
                              ! 1 is old strategy (using amp square)
 			     ! 2 is new strategy (using only the denominator)
-# To see advanced option for Phase-Space optimization: type "update psoptim"			     
+#*********************************************************************
+# Phase-Space Optim (advanced)
+#*********************************************************************
+   0 = job_strategy ! see appendix of 1507.00020 (page 26)
+   0 = hard_survey ! force a better estimate of the integral at survey for difficult modes like interference
+   -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channels of integration related to T-channel diagrams (value between -1 and 0); -1 means no impact
+   -1 = survey_splitting ! for loop-induced processes, control how many cores are used at survey for the computation of a single iteration
+   2 = survey_nchannel_per_job ! control how many channels are integrated inside a single job on cluster/multicore
+   -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means fewer jobs)
+#*********************************************************************
+# Compilation flags
+#*********************************************************************
+   -O = global_flag ! fortran optimization flag used for all the code.
+   --fast-math  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
+   -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
+   16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
+			     
 #*********************************************************************
 # Customization (custom cuts/scale/bias/...)                         *
 # list of files containing fortran function that overwrite default   *
@@ -185,3 +201,11 @@
 #
 systematics = systematics_program ! none, systematics [python], SysCalc [deprecated, C++]
 ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule
+
+#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+
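
Both run cards above follow MG5aMC's "value = name ! comment" line format. As a minimal, self-contained sketch (not the actual banner.py parser), entries such as the new SIMD/GPU block can be read back like this:

    # Minimal sketch (not the banner.py parser) of reading the
    # "value = name ! comment" entries used in run_card.dat:
    def read_run_card(path):
        params = {}
        for line in open(path):
            line = line.split('!', 1)[0].strip()  # drop the trailing comment
            if line.startswith('#') or '=' not in line:
                continue                          # skip headers and separators
            value, name = (s.strip() for s in line.rsplit('=', 1))
            params[name] = value
        return params

    # e.g. for the run_card.dat above this would yield:
    #   params['floating_type'] == 'd', params['avx_level'] == 'auto',
    #   params['cudacpp_backend'] == 'CPP', params['vector_size'] == '16384'
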
diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts
index 6409e99a49..e4b87ee6ad 100644
--- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts
+++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts
@@ -1,7 +1,6 @@
 DEFAULT_CPP_COMPILER=g++
 DEFAULT_F2PY_COMPILER=f2py3
 DEFAULT_F_COMPILER=gfortran
-#end_of_make_opts_variables
 GLOBAL_FLAG=-O3 -ffast-math -fbounds-check
 MACFLAG=
 MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime
diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py
index 3995ce8109..bd1517985f 100755
--- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py
+++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py
@@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True,
         if fct_mod:
             self.fct_mod[name] = fct_mod
 
-    def read(self, finput, consistency=True, unknown_warning=True):
+    def read(self, finput, consistency=True, unknown_warning=True, **opt):
         """Read the input file, this can be a path to a file, 
            a file object, a str with the content of the file."""
            
         if isinstance(finput, str):
             if "\n" in finput:
                 finput = finput.split('\n')
+                if 'path' in opt:
+                    self.path = opt['path']
             elif os.path.isfile(finput):
                 self.path = finput
                 finput = open(finput)
@@ -3913,7 +3915,7 @@ def remove_all_cut(self):
    %(global_flag)s = global_flag ! fortran optimization flag used for all the code.
    %(aloha_flag)s  = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math'
    %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3'
-   %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep)
+   %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX)
 """
 
 template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"'
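
The new **opt/path keyword matters because read() sets self.path only when finput is a file name: a card passed as an in-memory string used to leave the instance without a path, which is what produced the now-removed "CPPRunCard instance has no attribute path" warning in the log above. A condensed sketch of the new calling pattern, assuming the plugin classes are importable and with a hypothetical process directory:

    import os
    pjoin = os.path.join
    me_dir = '/path/to/pp_tt012j.mad'      # hypothetical process directory
    run_card_path = pjoin(me_dir, 'Cards', 'run_card.dat')
    card = CPPRunCard()                    # class defined in launch_plugin.py below
    # content arrives as a plain string, yet the card still learns its path:
    card.read(open(run_card_path).read(), path=run_card_path)
    # card.path is now set, so reset_simd()/reset_makeopts() can locate Source/
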
diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py
index 0b849330ef..3b09713e12 100644
--- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py
+++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py
@@ -16,22 +16,28 @@
     import internal.misc as misc
     import internal.extended_cmd as extended_cmd
     import internal.banner as banner_mod
+    import internal.common_run_interface as common_run_interface
 else:
     import madgraph.interface.madevent_interface as madevent_interface
     import madgraph.various.misc as misc
     import madgraph.interface.extended_cmd as extended_cmd
     import madgraph.various.banner as banner_mod
+    import madgraph.interface.common_run_interface as common_run_interface
 
 class CPPMEInterface(madevent_interface.MadEventCmdShell):
     def compile(self, *args, **opts):
         """ """
         import multiprocessing
         if not self.options['nb_core'] or self.options['nb_core'] == 'None':
-            self.options['nb_core'] = multiprocessing.cpu_count()
-        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
-            import pathlib
-            import os
-            pjoin = os.path.join
+            self.options['nb_core'] = multiprocessing.cpu_count()
+        if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source':
+            # before compiling in Source, propagate the run card SIMD/GPU
+            # settings (floating_type, avx_level) to Source/make_opts
+            path = pjoin(opts['cwd'], 'make_opts')
+            avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(path,
+                {'FPTYPE': self.run_card['floating_type'],
+                 'AVX': avx_level })
+            misc.sprint('FPTYPE checked')
+        if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'):
             cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py
             logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend)
             if cudacpp_backend == 'FORTRAN':
@@ -46,30 +52,83 @@ def compile(self, *args, **opts):
         else:
             return misc.compile(nb_core=self.options['nb_core'], *args, **opts)
 
+# Phase-Space Optimization ------------------------------------------------------------------------------------
+template_on = \
+"""#***********************************************************************
+# SIMD/GPU configuration for the CUDACPP plugin
+#************************************************************************
+ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors)
+ %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto
+ %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA
+"""
+
+template_off = ''
+plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off)
+
 class CPPRunCard(banner_mod.RunCardLO):
+    blocks = banner_mod.RunCardLO.blocks + [plugin_block]
+
     def reset_simd(self, old_value, new_value, name):
         if not hasattr(self, 'path'):
-            logger.warning('WARNING! CPPRunCard instance has no attribute path')
-            return
-            ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
+    def reset_makeopts(self, old_value, new_value, name):
+        if not hasattr(self, 'path'):
+            raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path')
+        Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
+        make_opts = pjoin(Sourcedir, 'make_opts') # update_make_opts_full takes the file path first (see compile above)
+        avx_value = self['avx_level'] if self['avx_level'] != 'auto' else ''
+        if name == 'floating_type':
+            common_run_interface.CommonRunCmd.update_make_opts_full(make_opts, {'FPTYPE': new_value, 'AVX': avx_value})
+        elif name == 'avx_level':
+            if new_value == 'auto': new_value = ''
+            common_run_interface.CommonRunCmd.update_make_opts_full(make_opts, {'FPTYPE': self['floating_type'], 'AVX': new_value})
+        else:
+            raise Exception('INTERNAL ERROR! reset_makeopts called for unexpected parameter %s' % name)
+        subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
     def plugin_input(self, finput):
         return
 
     def default_setup(self):
         super().default_setup()
-        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        self.add_param('floating_type', 'd', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['m','d','f'])
+        self.add_param('avx_level', 'auto', include=False, hidden=False,
+                       fct_mod=(self.reset_makeopts,(),{}),
+                       allowed=['auto', 'none', 'sse4', 'avx2','512y','512z'])
+        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False,
+                       allowed=['FORTRAN', 'CPP', 'CUDA'])
+        self['vector_size'] = 16 # already setup in default class (just change value)
+        self['aloha_flag'] = '--fast-math'
+        self['matrix_flag'] = '-O3'
+        self.display_block.append('simd')
+        self.display_block.append('psoptim')
 
+    # OM/AV - overload the default version in banner.py
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
-        if incname == "vector.inc" and 'vector_size' not in self.user_set:
-            return
-        super().write_one_include_file(output_dir, incname, output_file)
+        if incname == "vector.inc":
+            if 'vector_size' not in self.user_set: return
+            if output_file is None: vectorinc=pjoin(output_dir,incname)
+            else: vectorinc=output_file
+            # first pass: copy the Fortran comment header (lines starting
+            # with 'C') of the existing vector.inc into vector.inc.new
+            with open(vectorinc+'.new','w') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if line.startswith('C'): fileout.write(line)
+            # let the base class regenerate vector.inc from the run card
+            super().write_one_include_file(output_dir, incname, output_file)
+            # second pass: append the regenerated non-blank lines after the
+            # preserved header, then swap the new file into place
+            with open(vectorinc+'.new','a') as fileout:
+                with open(vectorinc) as filein:
+                    for line in filein:
+                        if not line.startswith('\n'): fileout.write(line)
+            os.replace(vectorinc+'.new',vectorinc)
+        else:
+            super().write_one_include_file(output_dir, incname, output_file)
 
     def check_validity(self):
         """ensure that PLUGIN information are consistent"""
@@ -82,13 +141,10 @@ def check_validity(self):
 
 class GPURunCard(CPPRunCard):
     def default_setup(self):
-        super(CPPRunCard, self).default_setup()
-        self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False)
-
-#class CUDACPPRunCard(CPPRunCard):
-#    def default_setup(self):
-#        super(CPPRunCard, self).default_setup()
-#        self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False)
+        super().default_setup()
+        # change default value:
+        self['cudacpp_backend'] = 'CUDA'
+        self['vector_size'] = 16384 # already setup in default class (just change value)
 
 MEINTERFACE = CPPMEInterface
 RunCard = CPPRunCard
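
The fct_mod=(self.reset_makeopts,(),{}) hooks above rely on the registry that add_param fills in banner.py (the "self.fct_mod[name] = fct_mod" line earlier in this diff): when a parameter's value changes, the stored callback fires with (old_value, new_value, name). A self-contained sketch of that mechanism, with hypothetical names:

    # Self-contained sketch (hypothetical names) of the fct_mod mechanism:
    # a callback registered with a parameter fires when its value changes.
    class ToyCard(dict):
        def __init__(self):
            super().__init__()
            self.fct_mod = {}
        def add_param(self, name, default, fct_mod=None):
            self[name] = default
            if fct_mod:
                self.fct_mod[name] = fct_mod
        def set_entry(self, name, value):
            old = self[name]
            self[name] = value
            if name in self.fct_mod and value != old:
                fct, args, opts = self.fct_mod[name]
                fct(old, value, name, *args, **opts)

    card = ToyCard()
    card.add_param('avx_level', 'auto',
                   fct_mod=(lambda old, new, name: print(f'{name}: {old} -> {new}'), (), {}))
    card.set_entry('avx_level', '512y')   # prints 'avx_level: auto -> 512y'
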
diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py
index 853aabc98a..cb6bf4ca57 100755
--- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py
+++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py
@@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None):
       
         if mode in ['run', 'all']:
             if not hasattr(self, 'run_card'):
-                run_card = banner_mod.RunCard(opt['run_card'])
+                run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat'))
             else:
                 run_card = self.run_card
             self.run_card = run_card
diff --git a/epochX/cudacpp/pp_tt012j.mad/mg5.in b/epochX/cudacpp/pp_tt012j.mad/mg5.in
index 91e22f5295..6bc40e7968 100644
--- a/epochX/cudacpp/pp_tt012j.mad/mg5.in
+++ b/epochX/cudacpp/pp_tt012j.mad/mg5.in
@@ -4,4 +4,4 @@ define j = p
 generate p p > t t~ @0
 add process p p > t t~ j @1
 add process p p > t t~ j j @2
-output madevent pp_tt012j.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp
+output madevent pp_tt012j.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp