diff --git a/dev_requirements.txt b/dev_requirements.txt index 645d3b8eee6..afc8bbefd42 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,6 +1,6 @@ jupyter<1.0.1 packaging<25.0 -pytest<8.3.2 +pytest<8.3.3 pytest-benchmark<4.0.1 pytest-cov<5.0.1 pytest-forked<1.7.0 diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py index 2e157d7b324..5ff6359225e 100644 --- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py +++ b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py @@ -2096,44 +2096,33 @@ def _function(self, # if ocm is not None and ocm.parameters.comp_execution_mode._get(context) in {"PTX", "LLVM"}: if ocm is not None and ocm.parameters.comp_execution_mode._get(context) in {"PTX", "LLVM"}: - # If we have a numpy array, convert back to ctypes - if isinstance(all_values, np.ndarray): - ct_values = all_values.flatten().ctypes.data_as(ctypes.POINTER(ctypes.c_double)) - num_values = len(all_values.flatten()) - else: - ct_values = all_values - num_values = len(ct_values) + ct_values = all_values + num_values = len(ct_values) # Reduce array of values to min/max # select_min params are: # params, state, min_sample_ptr, sample_ptr, min_value_ptr, value_ptr, opt_count_ptr, count min_tags = frozenset({"select_min", "evaluate_type_objective"}) - bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags) + bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags, numpy_args=(2, 4, 6)) + ct_param = bin_func.byref_arg_types[0](*self._get_param_initializer(context)) ct_state = bin_func.byref_arg_types[1](*self._get_state_initializer(context)) - ct_opt_sample = bin_func.byref_arg_types[2](float("NaN")) - ct_alloc = None # NULL for samples - ct_opt_value = bin_func.byref_arg_types[4]() - ct_opt_count = bin_func.byref_arg_types[6](0) - ct_start = bin_func.c_func.argtypes[7](0) - ct_stop = bin_func.c_func.argtypes[8](num_values) - - bin_func(ct_param, ct_state, ct_opt_sample, ct_alloc, ct_opt_value, - ct_values, ct_opt_count, ct_start, ct_stop) - - optimal_value = ct_opt_value.value - optimal_sample = np.ctypeslib.as_array(ct_opt_sample) - - if not isinstance(all_values, np.ndarray): - all_values = np.ctypeslib.as_array(ct_values) - - # These are normally stored in the parent function (OptimizationFunction). - # Since we didn't call super()._function like the python path, - # save the values here - if self.parameters.save_samples._get(context): - self.parameters.saved_samples._set(all_samples, context) - if self.parameters.save_values._get(context): - self.parameters.saved_values._set(all_values, context) + optimal_sample = bin_func.np_buffer_for_arg(2) + optimal_value = bin_func.np_buffer_for_arg(4) + number_of_optimal_values = bin_func.np_buffer_for_arg(6, fill_value=0) + + bin_func(ct_param, + ct_state, + optimal_sample, + None, # samples. NULL, it's generated by the function. + optimal_value, + ct_values, + number_of_optimal_values, + bin_func.c_func.argtypes[7](0), # start + bin_func.c_func.argtypes[8](num_values)) # stop + + # Convert outputs to Numpy/Python + all_values = np.ctypeslib.as_array(ct_values) # Python version else: @@ -2153,6 +2142,12 @@ def _function(self, [all_samples[:,i] for i in range(all_samples.shape[1])]) optimal_value, optimal_sample = next(value_sample_pairs) + # The algorithm below implements "Reservoir sampling"[0]. This + # matches the compiled implementation of "select_min". 
The + # advantage of reservoir sampling is constant memory requirements + # and a single pass over the evaluated values. + # The disadvantage is multiple calls to the PRNG. + # https://en.wikipedia.org/wiki/Reservoir_sampling select_randomly = self.parameters.select_randomly_from_optimal_values._get(context) for value, sample in value_sample_pairs: if select_randomly and np.allclose(value, optimal_value): diff --git a/psyneulink/core/compositions/composition.py b/psyneulink/core/compositions/composition.py index 804ed9f6f45..79177e1c3b2 100644 --- a/psyneulink/core/compositions/composition.py +++ b/psyneulink/core/compositions/composition.py @@ -11681,7 +11681,8 @@ def _execute_controller(self, assert (execution_mode == pnlvm.ExecutionMode.LLVM or execution_mode & pnlvm.ExecutionMode._Fallback),\ f"PROGRAM ERROR: Unrecognized compiled execution_mode: '{execution_mode}'." - _comp_ex.execute_node(self.controller, context=context) + _comp_ex.freeze_values() + _comp_ex.execute_node(self.controller) context.remove_flag(ContextFlags.PROCESSING) @@ -12010,7 +12011,7 @@ def execute( build_CIM_input = self._build_variable_for_input_CIM(inputs) if execution_mode & pnlvm.ExecutionMode.COMPILED: - _comp_ex.execute_node(self.input_CIM, inputs, context) + _comp_ex.execute_node(self.input_CIM, inputs) # FIXME: parameter_CIM should be executed here as well, # but node execution of nested compositions with # outside control is not supported yet. @@ -12295,7 +12296,7 @@ def execute( # Execute Mechanism if execution_mode & pnlvm.ExecutionMode.COMPILED: - _comp_ex.execute_node(node, context=context) + _comp_ex.execute_node(node) else: if node is not self.controller: mech_context = copy(context) @@ -12507,7 +12508,7 @@ def execute( # Extract result here if execution_mode & pnlvm.ExecutionMode.COMPILED: _comp_ex.freeze_values() - _comp_ex.execute_node(self.output_CIM, context=context) + _comp_ex.execute_node(self.output_CIM) report(self, PROGRESS_REPORT, report_num=report_num, diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py index bd931413e9b..568ef7ec910 100644 --- a/psyneulink/core/llvm/__init__.py +++ b/psyneulink/core/llvm/__init__.py @@ -23,7 +23,7 @@ from . 
import codegen from .builder_context import * -from .builder_context import _all_modules, _convert_llvm_ir_to_ctype +from .builder_context import _all_modules, _convert_llvm_ir_to_ctype, _convert_llvm_ir_to_dtype from .debug import debug_env from .execution import * from .execution import _tupleize @@ -123,7 +123,7 @@ def _llvm_build(target_generation=_binary_generation + 1): class LLVMBinaryFunction: - def __init__(self, name: str): + def __init__(self, name: str, *, numpy_args=()): self.name = name self.__c_func = None @@ -143,17 +143,25 @@ def __init__(self, name: str): # Create ctype function instance start = time.perf_counter() return_type = _convert_llvm_ir_to_ctype(f.return_value.type) - params = [_convert_llvm_ir_to_ctype(a.type) for a in f.args] + args = [_convert_llvm_ir_to_ctype(a.type) for a in f.args] + + # '_type_' special attribute stores pointee type for pointers + # https://docs.python.org/3/library/ctypes.html#ctypes._Pointer._type_ + self.byref_arg_types = [a._type_ if hasattr(a, "contents") else None for a in args] + self.np_params = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args] + + for a in numpy_args: + assert self.byref_arg_types[a] is not None + args[a] = np.ctypeslib.ndpointer(dtype=self.np_params[a].base, shape=self.np_params[a].shape) + middle = time.perf_counter() - self.__c_func_type = ctypes.CFUNCTYPE(return_type, *params) + self.__c_func_type = ctypes.CFUNCTYPE(return_type, *args) finish = time.perf_counter() if "time_stat" in debug_env: print("Time to create ctype function '{}': {} ({} to create types)".format( name, finish - start, middle - start)) - self.byref_arg_types = [p._type_ for p in params] - @property def c_func(self): if self.__c_func is None: @@ -218,18 +226,26 @@ def cuda_wrap_call(self, *args, **kwargs): wrap_args = (jit_engine.pycuda.driver.InOut(a) if isinstance(a, np.ndarray) else a for a in args) self.cuda_call(*wrap_args, **kwargs) + def np_buffer_for_arg(self, arg_num, *, extra_dimensions=(), fill_value=np.nan): + + out_base = self.np_params[arg_num].base + out_shape = extra_dimensions + self.np_params[arg_num].shape + + # fill the buffer with NaN poison + return np.full(out_shape, fill_value, dtype=out_base) + @staticmethod @functools.lru_cache(maxsize=32) - def from_obj(obj, *, tags:frozenset=frozenset()): + def from_obj(obj, *, tags:frozenset=frozenset(), numpy_args:tuple=()): name = LLVMBuilderContext.get_current().gen_llvm_function(obj, tags=tags).name - return LLVMBinaryFunction.get(name) + return LLVMBinaryFunction.get(name, numpy_args=numpy_args) @staticmethod @functools.lru_cache(maxsize=32) - def get(name: str): - return LLVMBinaryFunction(name) + def get(name: str, *, numpy_args:tuple=()): + return LLVMBinaryFunction(name, numpy_args=numpy_args) - def get_multi_run(self): + def get_multi_run(self, *, numpy_args=()): try: multirun_llvm = _find_llvm_function(self.name + "_multirun") except ValueError: @@ -237,7 +253,7 @@ def get_multi_run(self): with LLVMBuilderContext.get_current() as ctx: multirun_llvm = codegen.gen_multirun_wrapper(ctx, function) - return LLVMBinaryFunction.get(multirun_llvm.name) + return LLVMBinaryFunction.get(multirun_llvm.name, numpy_args=numpy_args) _cpu_engine = None diff --git a/psyneulink/core/llvm/builder_context.py b/psyneulink/core/llvm/builder_context.py index a4dd418f6f7..edc77fddad9 100644 --- a/psyneulink/core/llvm/builder_context.py +++ b/psyneulink/core/llvm/builder_context.py @@ -52,7 +52,7 @@ def module_count(): 'mt_rand_init', 'philox_rand_init')) -class 
_node_wrapper(): +class _node_assembly(): def __init__(self, composition, node): self._comp = weakref.proxy(composition) self._node = node @@ -61,7 +61,7 @@ def __repr__(self): return "Node wrapper for node '{}' in composition '{}'".format(self._node, self._comp) def _gen_llvm_function(self, *, ctx, tags:frozenset): - return codegen.gen_node_wrapper(ctx, self._comp, self._node, tags=tags) + return codegen.gen_node_assembly(ctx, self._comp, self._node, tags=tags) def _comp_cached(func): @functools.wraps(func) @@ -349,6 +349,13 @@ def get_state_space(self, builder, component, state_ptr, param): return helpers.get_state_space(builder, component, state_ptr, param_name) def check_used_params(self, component, *, tags:frozenset): + """ + This function checks that parameters included in the compiled structures are used in compiled code. + + If the assertion in this function triggers the parameter name should be added to the parameter + block list in the Component class. + """ + # Skip the check if the parameter use is not tracked. Some components (like node wrappers) # don't even have parameters. if component not in self._component_state_use and component not in self._component_param_use: @@ -378,12 +385,6 @@ def check_used_params(self, component, *, tags:frozenset): if hasattr(component, 'evaluate_agent_rep'): used_param_ids.add('num_trials_per_estimate') - if hasattr(component, 'adapt_scale'): - used_param_ids.add('threshold') - used_param_ids.add('adapt_scale') - used_param_ids.add('adapt_base') - used_param_ids.add('adapt_entropy_weighting') - unused_param_ids = component_param_ids - used_param_ids - initializers unused_state_ids = component_state_ids - used_state_ids @@ -504,38 +505,37 @@ def get_data_struct_type(self, component): return ir.LiteralStructType([]) - def get_node_wrapper(self, composition, node): - cache = getattr(composition, '_wrapped_nodes', None) + def get_node_assembly(self, composition, node): + cache = getattr(composition, '_node_assemblies', None) if cache is None: cache = weakref.WeakKeyDictionary() - setattr(composition, '_wrapped_nodes', cache) - return cache.setdefault(node, _node_wrapper(composition, node)) + setattr(composition, '_node_assemblies', cache) + return cache.setdefault(node, _node_assembly(composition, node)) def convert_python_struct_to_llvm_ir(self, t): self._stats["types_converted"] += 1 if t is None: return ir.LiteralStructType([]) - elif type(t) is list: - if len(t) == 0: - return ir.LiteralStructType([]) - elems_t = [self.convert_python_struct_to_llvm_ir(x) for x in t] - if all(x == elems_t[0] for x in elems_t): - return ir.ArrayType(elems_t[0], len(elems_t)) - return ir.LiteralStructType(elems_t) - elif type(t) is tuple: + + elif isinstance(t, (list, tuple)): elems_t = [self.convert_python_struct_to_llvm_ir(x) for x in t] if len(elems_t) > 0 and all(x == elems_t[0] for x in elems_t): return ir.ArrayType(elems_t[0], len(elems_t)) + return ir.LiteralStructType(elems_t) + elif isinstance(t, enum.Enum): # FIXME: Consider enums of non-int type assert all(round(x.value) == x.value for x in type(t)) return self.int32_ty + elif isinstance(t, (int, float, np.floating)): return self.float_ty + elif isinstance(t, np.integer): # Python 'int' is handled above as it is the default type for '0' return ir.IntType(t.nbytes * 8) + elif isinstance(t, np.ndarray): # 0d uint32 values were likely created from enums (above) and are # observed here after compilation sync. 
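(The hunk above merges the separate list and tuple branches of `convert_python_struct_to_llvm_ir`: homogeneous containers become LLVM array types, heterogeneous ones become literal structs. The new `_convert_llvm_ir_to_dtype` helper, introduced further down in this file, mirrors that rule on the numpy side: arrays become subarray dtypes and structs become structured dtypes, which `LLVMBinaryFunction` then stores in `np_params` and uses both for `np.ctypeslib.ndpointer` argument declarations and for output allocation in `np_buffer_for_arg`. A minimal sketch of that correspondence, using only standard numpy calls and made-up example shapes, not code from the patch:)

```python
import numpy as np

# Homogeneous aggregate (LLVM ArrayType) -> subarray dtype, e.g. [2 x [3 x double]]
arr_dt = np.dtype((np.float64, (2, 3)))
assert arr_dt.subdtype == (np.dtype(np.float64), (2, 3))

# Heterogeneous aggregate (LLVM LiteralStructType) -> structured dtype
struct_dt = np.dtype([("field_0", np.float64),
                      ("field_1", np.uint32, (4,))], align=True)

# np_buffer_for_arg-style allocation: a NaN-filled buffer with the argument's
# base dtype and shape, ready to be passed to the compiled function
out = np.full((2, 3), np.nan, dtype=arr_dt.base)
```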
@@ -543,18 +543,24 @@ def convert_python_struct_to_llvm_ir(self, t): if t.ndim == 0 and t.dtype == np.uint32: return self.convert_python_struct_to_llvm_ir(t.reshape(1)[0]) return self.convert_python_struct_to_llvm_ir(t.tolist()) + elif isinstance(t, np.random.RandomState): return pnlvm.builtins.get_mersenne_twister_state_struct(self) + elif isinstance(t, np.random.Generator): assert isinstance(t.bit_generator, np.random.Philox) return pnlvm.builtins.get_philox_state_struct(self) + elif isinstance(t, Time): return ir.ArrayType(self.int32_ty, len(TimeScale)) + elif isinstance(t, SampleIterator): if isinstance(t.generator, list): return ir.ArrayType(self.float_ty, len(t.generator)) + # Generic iterator is {start, increment, count} return ir.LiteralStructType((self.float_ty, self.float_ty, self.int32_ty)) + assert False, "Don't know how to convert {}".format(type(t)) @@ -765,3 +771,53 @@ def _convert_llvm_ir_to_ctype(t: ir.Type): assert False, "Don't know how to convert LLVM type: {}".format(t) return ret_t + +@functools.lru_cache(maxsize=16) +def _convert_llvm_ir_to_dtype(t: ir.Type): + + if isinstance(t, ir.IntType): + if t.width == 8: + return np.uint8().dtype + + elif t.width == 16: + return np.uint16().dtype + + elif t.width == 32: + return np.uint32().dtype + + elif t.width == 64: + return np.uint64().dtype + + else: + assert False, "Unsupported integer type: {}".format(type(t)) + + elif isinstance(t, ir.DoubleType): + return np.float64().dtype + + elif isinstance(t, ir.FloatType): + return np.float32().dtype + + elif isinstance(t, ir.HalfType): + return np.float16().dtype + + elif isinstance(t, ir.ArrayType): + element_type = _convert_llvm_ir_to_dtype(t.element) + + # Create multidimensional array instead of nesting + if element_type.subdtype is not None: + element_type, shape = element_type.subdtype + else: + shape = () + + ret_t = np.dtype((element_type, (len(t),) + shape)) + + elif isinstance(t, ir.LiteralStructType): + field_list = [] + for i, e in enumerate(t.elements): + field_list.append(("field_" + str(i), _convert_llvm_ir_to_dtype(e))) + + ret_t = np.dtype(field_list, align=True) + else: + assert False, "Don't know how to convert LLVM type to dtype: {}".format(t) + + return ret_t diff --git a/psyneulink/core/llvm/codegen.py b/psyneulink/core/llvm/codegen.py index 16eca1c8ddb..df792ce5fe9 100644 --- a/psyneulink/core/llvm/codegen.py +++ b/psyneulink/core/llvm/codegen.py @@ -585,9 +585,9 @@ def find_max(builder, x): return res -def gen_node_wrapper(ctx, composition, node, *, tags:frozenset): - assert "node_wrapper" in tags - func_tags = tags.difference({"node_wrapper"}) +def gen_node_assembly(ctx, composition, node, *, tags:frozenset): + assert "node_assembly" in tags + func_tags = tags.difference({"node_assembly"}) node_function = ctx.import_llvm_function(node, tags=func_tags) # FIXME: This is a hack @@ -782,14 +782,14 @@ def _gen_composition_exec_context(ctx, composition, *, tags:frozenset, suffix="" params = builder.alloca(const_params.type, name="const_params_loc") builder.store(const_params, params) - node_tags = tags.union({"node_wrapper"}) + node_tags = tags.union({"node_assembly"}) # Call input CIM - input_cim_w = ctx.get_node_wrapper(composition, composition.input_CIM) + input_cim_w = ctx.get_node_assembly(composition, composition.input_CIM) input_cim_f = ctx.import_llvm_function(input_cim_w, tags=node_tags) builder.call(input_cim_f, [state, params, comp_in, data, data]) # Call parameter CIM - param_cim_w = ctx.get_node_wrapper(composition, composition.parameter_CIM) + 
param_cim_w = ctx.get_node_assembly(composition, composition.parameter_CIM) param_cim_f = ctx.import_llvm_function(param_cim_w, tags=node_tags) builder.call(param_cim_f, [state, params, comp_in, data, data]) @@ -803,7 +803,7 @@ def _gen_composition_exec_context(ctx, composition, *, tags:frozenset, suffix="" def gen_composition_exec(ctx, composition, *, tags:frozenset): simulation = "simulation" in tags - node_tags = tags.union({"node_wrapper"}) + node_tags = tags.union({"node_assembly"}) with _gen_composition_exec_context(ctx, composition, tags=tags) as (builder, data, params, cond_gen): state, _, comp_in, _, cond = builder.function.args @@ -823,7 +823,7 @@ def gen_composition_exec(ctx, composition, *, tags:frozenset): is_finished_callbacks = {} for node in composition.nodes: args = [state, params, comp_in, data, output_storage] - wrapper = ctx.get_node_wrapper(composition, node) + wrapper = ctx.get_node_assembly(composition, node) is_finished_callbacks[node] = (wrapper, args) @@ -851,14 +851,14 @@ def gen_composition_exec(ctx, composition, *, tags:frozenset): num_exec_locs, nodes_states) with builder.if_then(reinit_cond): - node_w = ctx.get_node_wrapper(composition, node) + node_w = ctx.get_node_assembly(composition, node) node_reinit_f = ctx.import_llvm_function(node_w, tags=node_tags.union({"reset"})) builder.call(node_reinit_f, [state, params, comp_in, data, data]) # Run controller if it's enabled in 'BEFORE' mode if simulation is False and composition.enable_controller and composition.controller_mode == BEFORE: assert composition.controller is not None - controller_w = ctx.get_node_wrapper(composition, composition.controller) + controller_w = ctx.get_node_assembly(composition, composition.controller) controller_f = ctx.import_llvm_function(controller_w, tags=node_tags) builder.call(controller_f, [state, params, comp_in, data, data]) @@ -929,7 +929,7 @@ def gen_composition_exec(ctx, composition, *, tags:frozenset): run_set_node_ptr = builder.gep(run_set_ptr, [zero, ctx.int32_ty(idx)]) node_cond = builder.load(run_set_node_ptr, name="node_" + node.name + "_should_run") with builder.if_then(node_cond): - node_w = ctx.get_node_wrapper(composition, node) + node_w = ctx.get_node_assembly(composition, node) node_f = ctx.import_llvm_function(node_w, tags=node_tags) builder.block.name = "invoke_" + node_f.name # Wrappers do proper indexing of all structures @@ -984,12 +984,12 @@ def gen_composition_exec(ctx, composition, *, tags:frozenset): if simulation is False and composition.enable_controller and \ composition.controller_mode == AFTER: assert composition.controller is not None - controller_w = ctx.get_node_wrapper(composition, composition.controller) + controller_w = ctx.get_node_assembly(composition, composition.controller) controller_f = ctx.import_llvm_function(controller_w, tags=node_tags) builder.call(controller_f, [state, params, comp_in, data, data]) # Call output CIM - output_cim_w = ctx.get_node_wrapper(composition, composition.output_CIM) + output_cim_w = ctx.get_node_assembly(composition, composition.output_CIM) output_cim_f = ctx.import_llvm_function(output_cim_w, tags=node_tags) builder.block.name = "invoke_" + output_cim_f.name builder.call(output_cim_f, [state, params, comp_in, data, data]) @@ -1180,9 +1180,9 @@ def gen_autodiffcomp_exec(ctx, composition, *, tags:frozenset): pytorch_func = ctx.import_llvm_function(pytorch_model, tags=tags) builder.call(pytorch_func, [state, params, data]) - node_tags = tags.union({"node_wrapper"}) + node_tags = tags.union({"node_assembly"}) 
# Call output CIM - output_cim_w = ctx.get_node_wrapper(composition, composition.output_CIM) + output_cim_w = ctx.get_node_assembly(composition, composition.output_CIM) output_cim_f = ctx.import_llvm_function(output_cim_w, tags=node_tags) builder.call(output_cim_f, [state, params, comp_in, data, data]) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 0d05164887d..f90919b97bc 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -48,25 +48,6 @@ def _tupleize(x): except TypeError: return x if x is not None else tuple() -def _element_dtype(x): - """ - Extract base builtin type from aggregate type. - - Throws assertion failure if the aggregate type includes more than one base type. - The assumption is that array of builtin type has the same binary layout as - the original aggregate and it's easier to construct - """ - dt = np.dtype(x) - while dt.subdtype is not None: - dt = dt.subdtype[0] - - if not dt.isbuiltin: - fdts = (_element_dtype(f[0]) for f in dt.fields.values()) - dt = next(fdts) - assert all(dt == fdt for fdt in fdts) - - assert dt.isbuiltin, "Element type is not builtin: {} from {}".format(dt, np.dtype(x)) - return dt def _pretty_size(size): units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB'] @@ -100,7 +81,9 @@ def _get_compilation_param(self, name, init_method, arg): struct = struct_ty(*initializer) struct_end = time.time() - numpy_struct = np.ctypeslib.as_array(struct) + # numpy "frombuffer" creates a shared memory view of the provided buffer + numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_params[arg], count=len(self._execution_contexts)) + assert numpy_struct.nbytes == ctypes.sizeof(struct), \ "Size mismatch ({}), numpy: {} vs. ctypes:{}".format(name, numpy_struct.nbytes, ctypes.sizeof(struct)) @@ -120,6 +103,8 @@ def _get_compilation_param(self, name, init_method, arg): if len(self._execution_contexts) == 1: + numpy_struct.shape = () + if name == '_state': self._copy_params_to_pnl(self._execution_contexts[0], self._obj, @@ -232,13 +217,16 @@ def _enumerate_recurse(elements): pnl_param.set(value, context=context, override=True, compilation_sync=True) + def _get_indexable(self, np_array): + # outputs in recarrays need to be converted to list/tuple to be indexable + return np_array.tolist() if np_array.dtype.base.shape == () else np_array class CUDAExecution(Execution): - def __init__(self, buffers=['param_struct', 'state_struct', 'out']): + def __init__(self, buffers=['param_struct', 'state_struct']): super().__init__() - self._gpu_buffers = {} - for b in buffers: - self._gpu_buffers["_" + b] = None + + # Initialize GPU buffer map + self._gpu_buffers = {"_" + b: None for b in buffers} @property def _bin_func_multirun(self): @@ -253,7 +241,7 @@ def __get_cuda_arg(self, struct_name, arg_handler): # .array is a public member of pycuda's In/Out ArgumentHandler classes if gpu_buffer is None or gpu_buffer.array is not np_struct: - # 0-sized structures fail to upload use a dummy numpy array isntead + # 0-sized structures fail to upload use a dummy numpy array instead gpu_buffer = arg_handler(np_struct if np_struct.nbytes > 0 else np.zeros(2)) self._gpu_buffers[struct_name] = gpu_buffer @@ -276,54 +264,41 @@ def _cuda_data_struct(self): def _cuda_conditions(self): return self.__get_cuda_arg("_conditions", jit_engine.pycuda.driver.InOut) - @property - def _cuda_out(self): - gpu_buffer = self._gpu_buffers["_out"] - if gpu_buffer is None: - gpu_buffer = 
jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(self._ct_vo)) - self._gpu_buffers["_out"] = gpu_buffer - - return gpu_buffer - def cuda_execute(self, variable): - # Create input argument - new_var = np.asfarray(variable, dtype=self._vi_dty) + # Create input argument, PyCUDA doesn't care about shape + new_var = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) data_in = jit_engine.pycuda.driver.In(new_var) + extra_dims = (len(self._execution_contexts),) if len(self._execution_contexts) > 1 else () + data_out = self._bin_func.np_buffer_for_arg(3, extra_dimensions=extra_dims) + self._bin_func.cuda_call(self._cuda_param_struct, self._cuda_state_struct, data_in, - self._cuda_out, + jit_engine.pycuda.driver.Out(data_out), threads=len(self._execution_contexts)) - return _convert_ctype_to_python(self._ct_vo) + return self._get_indexable(data_out) class FuncExecution(CUDAExecution): def __init__(self, component, execution_ids=[None], *, tags=frozenset()): super().__init__() - self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags) + + self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags, numpy_args=(0, 1, 2, 3)) self._execution_contexts = [ Context(execution_id=eid) for eid in execution_ids ] self._component = component - _, _, vi_ty, vo_ty = self._bin_func.byref_arg_types if len(execution_ids) > 1: self._bin_multirun = self._bin_func.get_multi_run() self._ct_len = ctypes.c_int(len(execution_ids)) - vo_ty = vo_ty * len(execution_ids) - vi_ty = vi_ty * len(execution_ids) - self._ct_vo = vo_ty() - self._vi_dty = _element_dtype(vi_ty) - if "stat" in self._debug_env: - print("Input struct size:", _pretty_size(ctypes.sizeof(vi_ty)), - "for", self._component.name) - print("Output struct size:", _pretty_size(ctypes.sizeof(vo_ty)), - "for", self._component.name) + vo_ty = self._bin_func.byref_arg_types[3] * len(execution_ids) + self._ct_vo = vo_ty() @property def _obj(self): @@ -338,36 +313,29 @@ def _state_struct(self): return self._get_compilation_param('_state', '_get_state_initializer', 1) def execute(self, variable): - # Make sure function inputs are 2d. - # Mechanism inputs are already 3d so the first part is nop. - new_variable = np.asfarray(np.atleast_2d(variable), - dtype=self._vi_dty) + new_variable = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) - ct_vi = np.ctypeslib.as_ctypes(new_variable) if len(self._execution_contexts) > 1: - # wrap_call casts the arguments so we only need contiguous data - # layout + # wrap_call casts the arguments so we only need contiguous data layout + ct_vi = np.ctypeslib.as_ctypes(new_variable) + self._bin_multirun.wrap_call(self._param_struct[0], self._state_struct[0], ct_vi, self._ct_vo, self._ct_len) + return _convert_ctype_to_python(self._ct_vo) else: - self._bin_func(self._param_struct[0], self._state_struct[0], ct_vi, self._ct_vo) + data_out = self._bin_func.np_buffer_for_arg(3) + data_in = new_variable.reshape(self._bin_func.np_params[2].shape) - return _convert_ctype_to_python(self._ct_vo) + self._bin_func(self._param_struct[1], self._state_struct[1], data_in, data_out) + return self._get_indexable(data_out) -class MechExecution(FuncExecution): - def execute(self, variable): - # Convert to 3d. 
We always assume that: - # a) the input is vector of input ports - # b) input ports take vector of projection outputs - # c) projection output is a vector (even 1 element vector) - new_var = np.atleast_3d(variable) - new_var.shape = (len(self._component.input_ports), 1, -1) - return super().execute(new_var) +class MechExecution(FuncExecution): + pass class CompExecution(CUDAExecution): @@ -385,10 +353,11 @@ def __init__(self, composition, execution_ids=[None], *, additional_tags=frozens self.__bin_func = None self.__bin_run_func = None self.__bin_run_multi_func = None - self.__frozen_vals = None + self.__frozen_values = None self.__tags = frozenset(additional_tags) - self.__conds = None + # Scheduling conditions, only used by "execute" + self.__conditions = None if len(execution_ids) > 1: self._ct_len = ctypes.c_int(len(execution_ids)) @@ -440,29 +409,37 @@ def _bin_func_multirun(self): def _set_bin_node(self, node): assert node in self._composition._all_nodes - wrapper = builder_context.LLVMBuilderContext.get_current().get_node_wrapper(self._composition, node) - self.__bin_func = pnlvm.LLVMBinaryFunction.from_obj( - wrapper, tags=self.__tags.union({"node_wrapper"})) + node_assembly = builder_context.LLVMBuilderContext.get_current().get_node_assembly(self._composition, node) + self.__bin_func = pnlvm.LLVMBinaryFunction.from_obj(node_assembly, + tags=self.__tags.union({"node_assembly"}), + numpy_args=(0, 1, 2, 3, 4)) @property def _conditions(self): - if self.__conds is None: + if self.__conditions is None: gen = helpers.ConditionGenerator(None, self._composition) + if len(self._execution_contexts) > 1: - cond_ctype = self._bin_func_multirun.byref_arg_types[4] * len(self._execution_contexts) - cond_initializer = (gen.get_condition_initializer() for _ in self._execution_contexts) + conditions_ctype = self._bin_func_multirun.byref_arg_types[4] * len(self._execution_contexts) + conditions_initializer = (gen.get_condition_initializer() for _ in self._execution_contexts) else: - cond_ctype = self._bin_func.byref_arg_types[4] - cond_initializer = gen.get_condition_initializer() + conditions_ctype = self._bin_func.byref_arg_types[4] + conditions_initializer = gen.get_condition_initializer() + + ct_conditions = conditions_ctype(*conditions_initializer) + np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_params[4], count=len(self._execution_contexts)) + + if len(self._execution_contexts) == 1: + np_conditions.shape = () + + self.__conditions = (ct_conditions, np_conditions) - c_conds = cond_ctype(*cond_initializer) - self.__conds = (c_conds, np.ctypeslib.as_array(c_conds)) if "stat" in self._debug_env: print("Instantiated condition struct ( size:" , - _pretty_size(ctypes.sizeof(cond_ctype)), ")", + _pretty_size(ctypes.sizeof(conditions_ctype)), ")", "for", self._composition.name) - return self.__conds + return self.__conditions @property def _param_struct(self): @@ -482,8 +459,8 @@ def _data_struct(self): def _data_struct(self, data_struct): self._data = data_struct - def _extract_node_struct(self, node, data): - # context structure consists of a list of node contexts, + def _extract_node_struct_from_ctype(self, node, data): + # state structure consists of a list of node states, # followed by a list of projection contexts; get the first one # parameter structure consists of a list of node parameters, # followed by a list of projection parameters; get the first one @@ -499,60 +476,90 @@ def _extract_node_struct(self, node, data): return _convert_ctype_to_python(res_struct) + def 
_extract_node_struct_from_numpy(self, node, data): + # state structure consists of a list of node states, + # followed by a list of projection contexts; get the first one + # parameter structure consists of a list of node parameters, + # followed by a list of projection parameters; get the first one + # output structure consists of a list of node outputs, + # followed by a list of nested data structures; get the first one + all_nodes = data[data.dtype.names[0]] + + # Get the index into the array of all nodes + index = self._composition._get_node_index(node) + node_struct = all_nodes[all_nodes.dtype.names[index]] + + # Return copies of the extracted functions to avoid corrupting the + # returned results in next execution + return node_struct.copy().tolist() if node_struct.shape == () else node_struct.copy() + def extract_node_struct(self, node, struct): if len(self._execution_contexts) > 1: - return [self._extract_node_struct(node, struct[i]) for i, _ in enumerate(self._execution_contexts)] + return [self._extract_node_struct_from_ctype(node, struct[0][i]) for i, _ in enumerate(self._execution_contexts)] else: - return self._extract_node_struct(node, struct) + return self._extract_node_struct_from_numpy(node, struct[1]) def extract_frozen_node_output(self, node): - return self.extract_node_struct(node, self.__frozen_vals) + return self.extract_node_struct(node, self.__frozen_values) def extract_node_output(self, node): - return self.extract_node_struct(node, self._data_struct[0]) + return self.extract_node_struct(node, self._data_struct) def extract_node_state(self, node): - return self.extract_node_struct(node, self._state_struct[0]) + return self.extract_node_struct(node, self._state_struct) def extract_node_params(self, node): - return self.extract_node_struct(node, self._param_struct[0]) + return self.extract_node_struct(node, self._param_struct) def insert_node_output(self, node, data): - my_field_name = self._data_struct[0]._fields_[0][0] - my_res_struct = getattr(self._data_struct[0], my_field_name) + # output structure consists of a list of node outputs, + # followed by a list of nested data structures; get the first one + all_nodes = self._data_struct[1][self._data_struct[1].dtype.names[0]] + + # Get the index into the array of all nodes index = self._composition._get_node_index(node) - node_field_name = my_res_struct._fields_[index][0] - setattr(my_res_struct, node_field_name, _tupleize(data)) + value = all_nodes[all_nodes.dtype.names[index]] + np.copyto(value, np.asarray(data, dtype=value.dtype)) def _get_input_struct(self, inputs): # Either node or composition execute. - # All execute functions expect inputs to be 3rd param. - c_input_type = self._bin_func.byref_arg_types[2] # Read provided input data and parse into an array (generator) if len(self._execution_contexts) > 1: assert len(self._execution_contexts) == len(inputs) - c_input_type = c_input_type * len(self._execution_contexts) + + # All execute functions expect inputs to be 3rd param. 
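(The execution-side changes above replace `np.ctypeslib.as_array` with `np.frombuffer`, so compiled parameter, state, and data structures are viewed as structured numpy arrays that share memory with the underlying ctypes buffers, and node results are read and written by field name, as in `_extract_node_struct_from_numpy` and `insert_node_output`. A small self-contained sketch of that pattern with a hypothetical two-node layout, not taken from the patch:)

```python
import ctypes
import numpy as np

class Node(ctypes.Structure):
    _fields_ = [("value", ctypes.c_double * 2)]

class Data(ctypes.Structure):
    _fields_ = [("nodes", Node * 2)]

data = Data()
data.nodes[0].value[:] = [1.0, 2.0]
data.nodes[1].value[:] = [3.0, 4.0]

# Structured dtypes matching the ctypes layout
node_dt = np.dtype([("value", np.float64, (2,))], align=True)
data_dt = np.dtype([("nodes", node_dt, (2,))], align=True)

# Shared-memory view of the ctypes buffer; no copy is made
view = np.frombuffer(data, dtype=data_dt, count=1)
view.shape = ()                                  # single execution context, as in the patch

all_nodes = view[view.dtype.names[0]]            # the "nodes" field
assert all_nodes["value"][1].tolist() == [3.0, 4.0]

# Writes through the view are visible in the ctypes structure
all_nodes["value"][0][:] = [9.0, 9.0]
assert data.nodes[0].value[0] == 9.0
```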
+ ct_input_type = self._bin_func.byref_arg_types[2] * len(self._execution_contexts) + input_data = (([x] for x in self._composition._build_variable_for_input_CIM(inp)) for inp in inputs) + + ct_input = ct_input_type(*_tupleize(input_data)) + np_input = np.ctypeslib.as_array(ct_input) else: - input_data = ([x] for x in self._composition._build_variable_for_input_CIM(inputs)) + ct_input = None + data = self._composition._build_variable_for_input_CIM(inputs) + + np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base) + np_input = np_input.reshape(self._bin_func.np_params[2].shape) if "stat" in self._debug_env: - print("Input struct size:", _pretty_size(ctypes.sizeof(c_input_type)), - "for", self._composition.name) - c_input = c_input_type(*_tupleize(input_data)) - return c_input, np.ctypeslib.as_array(c_input) + print("Input struct size:", _pretty_size(np_input.nbytes), "for", self._composition.name) + + return ct_input, np_input def freeze_values(self): - self.__frozen_vals = copy.deepcopy(self._data_struct[0]) + np_copy = self._data_struct[1].copy() - def execute_node(self, node, inputs=None, context=None): + self.__frozen_values = (None, np_copy) + + def execute_node(self, node, inputs=None): # We need to reconstruct the input dictionary here if it was not provided. # This happens during node execution of nested compositions. assert len(self._execution_contexts) == 1 + context = self._execution_contexts[0] + if inputs is None and node is self._composition.input_CIM: - if context is None: - context = self._execution_contexts[0] + port_inputs = {origin_port:[proj.parameters.value._get(context) for proj in p[0].path_afferents] for (origin_port, p) in self._composition.input_CIM_ports.items()} inputs = {} for p, v in port_inputs.items(): @@ -560,23 +567,33 @@ def execute_node(self, node, inputs=None, context=None): index = p.owner.input_ports.index(p) data[index] = v[0] + assert inputs is not None or node is not self._composition.input_CIM # Set bin node to make sure self._*struct works as expected self._set_bin_node(node) - if inputs is not None: - inputs = self._get_input_struct(inputs)[0] - assert inputs is not None or node is not self._composition.input_CIM + # Numpy doesn't allow to pass NULL to the called function. + # Create and pass a dummy buffer filled with NaN instead. + if inputs is not None: + inputs = self._get_input_struct(inputs)[1] + else: + inputs = self._bin_func.np_buffer_for_arg(2) - # Freeze output values if this is the first time we need them - if node is not self._composition.input_CIM and self.__frozen_vals is None: - self.freeze_values() + # Nodes other than input_CIM/parameter_CIM take inputs from projections + # and need frozen values available + if node is not self._composition.input_CIM and node is not self._composition.parameter_CIM: + assert self.__frozen_values is not None + data_in = self.__frozen_values[1] + else: + # The ndarray argument check doesn't allow None for null so just provide + # the same structure as outputs. + data_in = self._data_struct[1] - self._bin_func(self._state_struct[0], - self._param_struct[0], + self._bin_func(self._state_struct[1], + self._param_struct[1], inputs, - self.__frozen_vals, - self._data_struct[0]) + data_in, + self._data_struct[1]) if "comp_node_debug" in self._debug_env: print("RAN: {}. 
State: {}".format(node, self.extract_node_state(node))) @@ -589,7 +606,7 @@ def execute_node(self, node, inputs=None, context=None): def _bin_exec_func(self): if self.__bin_exec_func is None: self.__bin_exec_func = pnlvm.LLVMBinaryFunction.from_obj( - self._composition, tags=self.__tags) + self._composition, tags=self.__tags, numpy_args=(0, 1, 2, 3, 4)) return self.__bin_exec_func @@ -611,11 +628,11 @@ def execute(self, inputs): self._conditions[0], self._ct_len) else: - self._bin_exec_func(self._state_struct[0], - self._param_struct[0], - self._get_input_struct(inputs)[0], - self._data_struct[0], - self._conditions[0]) + self._bin_exec_func(self._state_struct[1], + self._param_struct[1], + self._get_input_struct(inputs)[1], + self._data_struct[1], + self._conditions[1]) def cuda_execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. @@ -664,7 +681,7 @@ def _get_generator_run_input_struct(self, inputs, runs): def _bin_run_func(self): if self.__bin_run_func is None: self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj( - self._composition, tags=self.__tags.union({"run"})) + self._composition, tags=self.__tags.union({"run"}), numpy_args=(0, 1, 2)) return self.__bin_run_func @@ -712,9 +729,9 @@ def run(self, inputs, runs=0, num_input_sets=0): # This is only needed for non-generator inputs that are wrapped in an extra context dimension inputs = ctypes.cast(inputs, self._bin_run_func.c_func.argtypes[3]) - self._bin_run_func(self._state_struct[0], - self._param_struct[0], - self._data_struct[0], + self._bin_run_func(self._state_struct[1], + self._param_struct[1], + self._data_struct[1], inputs, outputs, runs_count, @@ -766,7 +783,7 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective" tags = {"evaluate", "alloc_range", eval_type} - bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags)) + bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), numpy_args=(0, 1, 6)) self.__bin_func = bin_func # There are 8 arguments to evaluate_alloc_range: @@ -776,9 +793,9 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results # Directly initialized structures assert ocm.agent_rep is self._composition - comp_params = self._get_compilation_param('_eval_param', '_get_param_initializer', 0) - comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1) - comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6) + comp_params = self._get_compilation_param('_eval_param', '_get_param_initializer', 0)[1] + comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1)[1] + comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6)[1] # Construct input variable, the 5th parameter of the evaluate function ct_inputs = self._get_run_input_struct(inputs, num_input_sets, 5) @@ -799,7 +816,6 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results "( evaluations:", num_evaluations, "element size:", ctypes.sizeof(out_el_ty), ")", "for", self._obj.name) - # return variable as numpy array. 
pycuda can use it directly return comp_params, comp_state, comp_data, ct_inputs, out_ty, ct_num_inputs def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool=False): @@ -808,11 +824,11 @@ def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:boo ct_results = out_ty() - cuda_args = (jit_engine.pycuda.driver.In(comp_params[1]), - jit_engine.pycuda.driver.InOut(comp_state[1]), + cuda_args = (jit_engine.pycuda.driver.In(comp_params), + jit_engine.pycuda.driver.InOut(comp_state), jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_results)), # results jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # inputs - jit_engine.pycuda.driver.InOut(comp_data[1]), # composition data + jit_engine.pycuda.driver.InOut(comp_data), # composition data jit_engine.pycuda.driver.In(np.int32(num_input_sets)), # number of inputs ) @@ -833,19 +849,19 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b # Create input and result typed casts once, they are the same # for every submitted job. - input_param = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5]) - results_param = ctypes.cast(ct_results, self.__bin_func.c_func.argtypes[4]) + input_arg = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5]) + results_arg = ctypes.cast(ct_results, self.__bin_func.c_func.argtypes[4]) # There are 7 arguments to evaluate_alloc_range: # comp_param, comp_state, from, to, results, input, comp_data results = [ex.submit(self.__bin_func, - comp_params[0], - comp_state[0], + comp_params, + comp_state, int(i * evals_per_job), min((i + 1) * evals_per_job, num_evaluations), - results_param, - input_param, - comp_data[0], + results_arg, + input_arg, + comp_data, ct_num_inputs) for i in range(jobs)] diff --git a/psyneulink/core/llvm/helpers.py b/psyneulink/core/llvm/helpers.py index a7464fd7664..2eae0e69974 100644 --- a/psyneulink/core/llvm/helpers.py +++ b/psyneulink/core/llvm/helpers.py @@ -706,7 +706,7 @@ def generate_sched_condition(self, builder, condition, cond_ptr, node, # The first argument is the target node assert len(condition.args) == 1 target = is_finished_callbacks[condition.args[0]] - is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_wrapper"})) + is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_assembly"})) return builder.call(is_finished_f, target[1]) elif isinstance(condition, WhenFinishedAny): @@ -715,7 +715,7 @@ def generate_sched_condition(self, builder, condition, cond_ptr, node, run_cond = self.ctx.bool_ty(0) for node in condition.args: target = is_finished_callbacks[node] - is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_wrapper"})) + is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_assembly"})) node_is_finished = builder.call(is_finished_f, target[1]) run_cond = builder.or_(run_cond, node_is_finished) @@ -728,7 +728,7 @@ def generate_sched_condition(self, builder, condition, cond_ptr, node, run_cond = self.ctx.bool_ty(1) for node in condition.args: target = is_finished_callbacks[node] - is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_wrapper"})) + is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_assembly"})) node_is_finished = builder.call(is_finished_f, target[1]) run_cond = builder.and_(run_cond, node_is_finished) diff --git 
a/psyneulink/library/components/mechanisms/processing/transfer/lcamechanism.py b/psyneulink/library/components/mechanisms/processing/transfer/lcamechanism.py index 4b197406217..48e5292a9d7 100644 --- a/psyneulink/library/components/mechanisms/processing/transfer/lcamechanism.py +++ b/psyneulink/library/components/mechanisms/processing/transfer/lcamechanism.py @@ -392,7 +392,7 @@ class Parameters(RecurrentTransferMechanism.Parameters): matrix = Parameter( INVERSE_HOLLOW_MATRIX, - modulable=True, + modulable=False, getter=_recurrent_transfer_mechanism_matrix_getter, setter=_recurrent_transfer_mechanism_matrix_setter ) @@ -402,9 +402,9 @@ class Parameters(RecurrentTransferMechanism.Parameters): function_parameter_name='rate', aliases='leak' ) - auto = Parameter(0.0, modulable=True, aliases='self_excitation') - hetero = Parameter(-1.0, modulable=True) - competition = Parameter(1.0, modulable=True) + auto = Parameter(0.0, modulable=False, aliases='self_excitation') + hetero = Parameter(-1.0, modulable=False) + competition = Parameter(1.0, modulable=False) time_step_size = FunctionParameter(0.1, function_name='integrator_function') integrator_mode = Parameter(True, setter=_integrator_mode_setter, valid_types=bool) diff --git a/requirements.txt b/requirements.txt index dc485bfd0e2..8966c89ce8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,6 @@ pillow<10.5.0 pint<0.22.0 protobuf<3.20.4 rich>=10.1, <10.13 -scipy<1.12 +scipy>=1.7.3, <1.15 toposort<1.11 torch>=1.10.0, <2.4.0; (platform_machine == 'AMD64' or platform_machine == 'x86_64' or platform_machine == 'arm64' or platform_machine == 'aarch64') and platform_python_implementation == 'CPython' and implementation_name == 'cpython' diff --git a/setup.cfg b/setup.cfg index ffc15d5cfb9..911094866c0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -71,6 +71,8 @@ filterwarnings = error:Creating an ndarray from ragged nested sequences \(which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes\) is deprecated.*:numpy.VisibleDeprecationWarning error:Invalid escape sequence error:the matrix subclass is not the recommended way to represent matrices or deal with linear algebra + error:Passing (type, 1) or '1type' as a synonym of type is deprecated + error:A builtin ctypes object gave a PEP3118:RuntimeWarning [pycodestyle] # for code explanation see https://pep8.readthedocs.io/en/latest/intro.html#error-codes diff --git a/tests/composition/test_control.py b/tests/composition/test_control.py index d390a7274f9..04a512b15ee 100644 --- a/tests/composition/test_control.py +++ b/tests/composition/test_control.py @@ -2711,7 +2711,8 @@ def test_modulation_of_random_state(self, comp_mode, num_generators): @pytest.mark.composition @pytest.mark.control class TestModelBasedOptimizationControlMechanisms_Execution: - def test_ocm_default_function(self): + @pytest.mark.parametrize("mode, ocm_mode", pytest.helpers.get_comp_and_ocm_execution_modes()) + def test_ocm_default_function(self, ocm_mode, mode): a = pnl.ProcessingMechanism() comp = pnl.Composition( controller_mode=pnl.BEFORE, @@ -2729,26 +2730,19 @@ def test_ocm_default_function(self): ), ) ) + comp.controller.comp_execution_mode = ocm_mode + assert type(comp.controller.function) == pnl.GridSearch - assert comp.run([1]) == [10] + + res = comp.run([1], execution_mode=mode) + np.testing.assert_array_equal(res, [[10]]) @pytest.mark.parametrize("nested", [True, False]) - @pytest.mark.parametrize("format", ["list", "tuple", "SampleIterator", "SampleIteratorArray", "SampleSpec", 
"ndArray"]) + @pytest.mark.parametrize("search_space", + [[1, 10], (1, 10), SampleIterator((1, 10)), SampleIterator([1, 10]), SampleSpec(1, 10, 9), np.array((1, 10))], + ids=["list", "tuple", "SampleIterator", "SampleIteratorArray", "SampleSpec", "ndArray"]) @pytest.mark.parametrize("mode, ocm_mode", pytest.helpers.get_comp_and_ocm_execution_modes()) - def test_ocm_searchspace_format_equivalence(self, format, nested, mode, ocm_mode): - - if format == "list": - search_space = [1, 10] - elif format == "tuple": - search_space = (1, 10) - elif format == "SampleIterator": - search_space = SampleIterator((1, 10)) - elif format == "SampleIteratorArray": - search_space = SampleIterator([1, 10]) - elif format == "SampleSpec": - search_space = SampleSpec(1, 10, 9) - elif format == "ndArray": - search_space = np.array((1, 10)) + def test_ocm_searchspace_format_equivalence(self, search_space, nested, mode, ocm_mode): if nested: search_space = [search_space] @@ -2772,7 +2766,9 @@ def test_ocm_searchspace_format_equivalence(self, format, nested, mode, ocm_mode comp.controller.comp_execution_mode = ocm_mode assert type(comp.controller.function) == pnl.GridSearch - assert comp.run([1], execution_mode=mode) == [[10]] + + res = comp.run([1], execution_mode=mode) + np.testing.assert_array_equal(res, [[10]]) def test_evc(self): # Mechanisms diff --git a/tests/composition/test_parameterestimationcomposition.py b/tests/composition/test_parameterestimationcomposition.py index bf3a8c3138b..12c39a64a3a 100644 --- a/tests/composition/test_parameterestimationcomposition.py +++ b/tests/composition/test_parameterestimationcomposition.py @@ -1,7 +1,10 @@ import numpy as np +import optuna import pandas as pd import pytest -import optuna +import scipy + +from packaging import version as pversion import psyneulink as pnl @@ -125,17 +128,32 @@ def test_pec_run_input_formats(inputs_dict, error_msg): pec.run(inputs=inputs_dict) +# SciPy changed their implementation of differential evolution and the way it selects +# samples to evaluate in 1.12 [0,1], and then again in 1.14 [2,3], leading to slightly +# different results +# +# [0] https://docs.scipy.org/doc/scipy/release/1.12.0-notes.html#scipy-optimize-improvements +# [1] https://github.com/scipy/scipy/pull/18496 +# [2] https://docs.scipy.org/doc/scipy/release/1.14.0-notes.html#scipy-optimize-improvements +# [3] https://github.com/scipy/scipy/pull/20677 +if pversion.parse(scipy.version.version) >= pversion.parse('1.14.0'): + expected_differential_evolution = [0.010113000942356953] +elif pversion.parse(scipy.version.version) >= pversion.parse('1.12.0'): + expected_differential_evolution = [0.010074123395259815] +else: + expected_differential_evolution = [0.010363518438648106] + @pytest.mark.composition @pytest.mark.parametrize( - "opt_method, result", + "opt_method, expected_result", [ - ("differential_evolution", [0.010363518438648106]), + ("differential_evolution", expected_differential_evolution), (optuna.samplers.RandomSampler(seed=0), [0.01]), (optuna.samplers.CmaEsSampler(seed=0), [0.01]), ], - ids=["differential_evolultion", "optuna_random_sampler", "optuna_cmaes_sampler"], + ids=["differential_evolution", "optuna_random_sampler", "optuna_cmaes_sampler"], ) -def test_parameter_optimization_ddm(func_mode, opt_method, result): +def test_parameter_optimization_ddm(func_mode, opt_method, expected_result): """Test parameter optimization of a DDM in integrator mode""" if func_mode == "Python": @@ -210,11 +228,9 @@ def reward_rate(sim_data): trial_inputs[0] = 
np.abs(trial_inputs[0]) trial_inputs[-1] = np.abs(trial_inputs[-1]) - inputs_dict = {decision: trial_inputs} - - ret = pec.run(inputs={comp: trial_inputs}) + pec.run(inputs={comp: trial_inputs}) - np.testing.assert_allclose(pec.optimized_parameter_values, result) + np.testing.assert_allclose(pec.optimized_parameter_values, expected_result) # func_mode is a hacky wa to get properly marked; Python, LLVM, and CUDA diff --git a/tests/llvm/test_builtins_intrinsics.py b/tests/llvm/test_builtins_intrinsics.py index fae99faa520..22cc3d2df8d 100644 --- a/tests/llvm/test_builtins_intrinsics.py +++ b/tests/llvm/test_builtins_intrinsics.py @@ -32,8 +32,10 @@ def test_builtin_op(benchmark, op, args, builtin, result, func_mode): if func_mode == 'Python': f = op + elif func_mode == 'LLVM': f = pnlvm.LLVMBinaryFunction.get(builtin) + elif func_mode == 'PTX': wrap_name = builtin + "_test_wrapper" with pnlvm.LLVMBuilderContext.get_current() as ctx: @@ -47,12 +49,18 @@ def test_builtin_op(benchmark, op, args, builtin, result, func_mode): builder.ret_void() bin_f = pnlvm.LLVMBinaryFunction.get(wrap_name) - dty = np.dtype(bin_f.byref_arg_types[0]) + + # The result argument is a pointer, use it to derive + # the right argument type + dty = bin_f.np_params[1].base + ptx_res = np.empty_like(result, dtype=dty) ptx_res_arg = pnlvm.jit_engine.pycuda.driver.Out(ptx_res) + def f(*a): bin_f.cuda_call(*(dty.type(p) for p in a), ptx_res_arg) return ptx_res + res = benchmark(f, *args) if pytest.helpers.llvm_current_fp_precision() == 'fp32': diff --git a/tests/llvm/test_builtins_matrix.py b/tests/llvm/test_builtins_matrix.py index e338f80bed3..1cad00e1565 100644 --- a/tests/llvm/test_builtins_matrix.py +++ b/tests/llvm/test_builtins_matrix.py @@ -39,9 +39,6 @@ def _get_const_dim_func(builtin, *dims): builtin = ctx.import_llvm_function(builtin) pointer_arg_types = [a for a in builtin.type.pointee.args if pnlvm.helpers.is_pointer(a)] - func_ty = ir.FunctionType(ir.VoidType(), pointer_arg_types) - - # Create square vector matrix multiply function = ir.Function(ctx.module, builtin.type.pointee, name=custom_name) const_dims = (ctx.int32_ty(d) for d in dims) @@ -65,6 +62,14 @@ def _get_const_dim_func(builtin, *dims): ], ids=["ADD", "SUB", "MUL", "ADDS", "MULS", "DOT", "TRANS DOT"]) @pytest.mark.parametrize("dims", [(DIM_X, DIM_Y), (0, 0)], ids=["VAR-DIM", "CONST-DIM"]) def test_matrix_op(benchmark, op, x, y, builtin, result, func_mode, dims): + + def _numpy_args(bin_f): + np_x = x.astype(bin_f.np_params[0]) + np_y = bin_f.np_params[1].type(y) if np.isscalar(y) else y.astype(bin_f.np_params[1]) + np_res = np.empty_like(result, dtype=bin_f.np_params[-1]) + + return np_x, np_y, np_res + if func_mode == 'Python': def ex(): return op(x, y) @@ -76,13 +81,7 @@ def ex(): func_name = builtin bin_f = pnlvm.LLVMBinaryFunction.get(func_name) - dty = np.dtype(bin_f.byref_arg_types[0]) - assert dty == np.dtype(bin_f.byref_arg_types[1]) - assert dty == np.dtype(bin_f.byref_arg_types[4]) - - lx = x.astype(dty) - ly = dty.type(y) if np.isscalar(y) else y.astype(dty) - lres = np.empty_like(result, dtype=dty) + lx, ly, lres = _numpy_args(bin_f) ct_x = lx.ctypes.data_as(bin_f.c_func.argtypes[0]) ct_y = ly if np.isscalar(ly) else ly.ctypes.data_as(bin_f.c_func.argtypes[1]) @@ -99,17 +98,12 @@ def ex(): func_name = builtin bin_f = pnlvm.LLVMBinaryFunction.get(func_name) - dty = np.dtype(bin_f.byref_arg_types[0]) - assert dty == np.dtype(bin_f.byref_arg_types[1]) - assert dty == np.dtype(bin_f.byref_arg_types[4]) - - lx = x.astype(dty) - ly = 
dty.type(y) if np.isscalar(y) else y.astype(dty) - lres = np.empty_like(result, dtype=dty) + lx, ly, lres = _numpy_args(bin_f) cuda_x = pnlvm.jit_engine.pycuda.driver.In(lx) cuda_y = ly if np.isscalar(ly) else pnlvm.jit_engine.pycuda.driver.In(ly) cuda_res = pnlvm.jit_engine.pycuda.driver.Out(lres) + def ex(): bin_f.cuda_call(cuda_x, cuda_y, np.int32(dims[0]), np.int32(dims[1]), cuda_res) return lres diff --git a/tests/llvm/test_builtins_mt_random.py b/tests/llvm/test_builtins_mt_random.py index 4d25e53c8cd..2ff7cff0ea2 100644 --- a/tests/llvm/test_builtins_mt_random.py +++ b/tests/llvm/test_builtins_mt_random.py @@ -15,35 +15,46 @@ def test_random_int(benchmark, mode): res = [] if mode == 'Python': state = random.Random(SEED) + def f(): return state.randrange(0xffffffff) + elif mode == 'numpy': # Numpy promotes elements to int64 state = np.random.RandomState([SEED]) + def f(): return state.randint(0xffffffff, dtype=np.int64) + elif mode == 'LLVM': - init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init') - state = init_fun.byref_arg_types[0]() + init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,)) + state = init_fun.np_buffer_for_arg(0) + init_fun(state, SEED) - gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32') - out = ctypes.c_ulonglong() + gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32', numpy_args=(0, 1)) + def f(): + out = gen_fun.np_buffer_for_arg(1) gen_fun(state, out) - return out.value + return out + elif mode == 'PTX': init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init') - state_size = ctypes.sizeof(init_fun.byref_arg_types[0]) + + state_size = init_fun.np_buffer_for_arg(0).nbytes gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size) + init_fun.cuda_call(gpu_state, np.int32(SEED)) gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32') - out = np.asarray([0], dtype=np.uint64) + out = gen_fun.np_buffer_for_arg(1) gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out) + def f(): gen_fun.cuda_call(gpu_state, gpu_out) - return out[0] + return out.copy() + else: assert False, "Unknown mode: {}".format(mode) @@ -61,35 +72,45 @@ def test_random_float(benchmark, mode): if mode == 'Python': # Python treats every seed as array state = random.Random(SEED) + def f(): return state.random() + elif mode == 'numpy': # numpy promotes elements to int64 state = np.random.RandomState([SEED]) + def f(): return state.random_sample() + elif mode == 'LLVM': - init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init') - state = init_fun.byref_arg_types[0]() + init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,)) + state = init_fun.np_buffer_for_arg(0) init_fun(state, SEED) - gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double') - out = gen_fun.byref_arg_types[1]() + gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double', numpy_args=(0, 1)) + def f(): + out = gen_fun.np_buffer_for_arg(1) gen_fun(state, out) - return out.value + return out + elif mode == 'PTX': init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init') - state_size = ctypes.sizeof(init_fun.byref_arg_types[0]) + + state_size = init_fun.np_buffer_for_arg(0).nbytes gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size) + init_fun.cuda_call(gpu_state, np.int32(SEED)) gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double') - out = np.asfarray([0.0], dtype=np.dtype(gen_fun.byref_arg_types[1])) + out = 
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -107,30 +128,38 @@ def f():
     if mode == 'numpy':
         # numpy promotes elements to int64
         state = np.random.RandomState([SEED])
+
         def f():
             return state.normal()
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal')
-        out = gen_fun.byref_arg_types[1]()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
+
         init_fun.cuda_call(gpu_state, np.int32(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal')
-        out = np.asfarray([0.0], dtype=np.dtype(gen_fun.byref_arg_types[1]))
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -157,35 +186,44 @@ def test_random_binomial(benchmark, mode, n, p, exp):
     if mode == 'numpy':
         # numpy promotes elements to int64
         state = np.random.RandomState([SEED])
+
         def f():
             return state.binomial(n, p)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial')
-        c_n = gen_fun.byref_arg_types[1](n)
-        c_p = gen_fun.byref_arg_types[2](p)
-        c_out = gen_fun.byref_arg_types[-1]()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial', numpy_args=(0, 1, 2, 3))
+        n = np.asarray(n, dtype=gen_fun.np_params[1])
+        p = np.asarray(p, dtype=gen_fun.np_params[2])
+
         def f():
-            gen_fun(state, c_n, c_p, c_out)
-            return c_out.value
+            out = gen_fun.np_buffer_for_arg(1)
+            gen_fun(state, n, p, out)
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
+
         init_fun.cuda_call(gpu_state, np.int32(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial')
-        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.array([n], dtype=np.dtype(gen_fun.byref_arg_types[1])))
-        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.array([p], dtype=np.dtype(gen_fun.byref_arg_types[2])))
-        out = np.array([0.0], dtype=np.dtype(gen_fun.byref_arg_types[3]))
+
+        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_params[1]))
+        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_params[2]))
+
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
         def f():
             gen_fun.cuda_call(gpu_state, gpu_n, gpu_p, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
diff --git a/tests/llvm/test_builtins_philox_random.py b/tests/llvm/test_builtins_philox_random.py
index 40fc1abc09a..0c6e289a700 100644
--- a/tests/llvm/test_builtins_philox_random.py
+++ b/tests/llvm/test_builtins_philox_random.py
@@ -26,27 +26,32 @@ def f():
             return prng.integers(0xffffffffffffffff, dtype=np.uint64, endpoint=True)
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, seed)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64')
-        out = ctypes.c_ulonglong()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return np.uint64(out.value)
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
+
         init_fun.cuda_call(gpu_state, np.int64(seed))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64')
-        out = np.asarray([0], dtype=np.uint64)
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -64,33 +69,38 @@ def test_random_int32(benchmark, mode):
     res = []
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             # Get uint range [0, MAX] to avoid any intermediate caching of random bits
             return prng.integers(0xffffffff, dtype=np.uint32, endpoint=True)
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32')
-        out = ctypes.c_uint()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32')
-        out = np.asarray([0], dtype=np.uint32)
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -109,30 +119,36 @@ def test_random_double(benchmark, mode):
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             return prng.random(dtype=np.float64)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double')
-        out = ctypes.c_double()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double')
-        out = np.asfarray([0.0], dtype=np.float64)
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -150,30 +166,36 @@ def test_random_float(benchmark, mode):
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             return prng.random(dtype=np.float32)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float')
-        out = ctypes.c_float()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float')
-        out = np.asfarray([0.0], dtype=np.float32)
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -197,30 +219,36 @@ def test_random_normal(benchmark, mode, fp_type):
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             return prng.standard_normal(dtype=dtype)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal')
-        out = gen_fun.byref_arg_types[1]()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal')
-        out = np.array([0.0], dtype=np.dtype(gen_fun.byref_arg_types[1]))
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -228,36 +256,38 @@ def f():
     if fp_type is pnlvm.ir.DoubleType():
         np.testing.assert_allclose(res[0:2], [-0.2059740286292238, -0.12884495093462758])
         # 208 doesn't take the fast path but wraps around the main loop
-        np.testing.assert_allclose(res[207:211], [-0.768690647997579, 0.4301874289485477,
-                                                  -0.7803640491708955, -1.146089287628737])
+        np.testing.assert_allclose(res[207:211],
+                                   [-0.768690647997579, 0.4301874289485477, -0.7803640491708955, -1.146089287628737])
         # 450 doesn't take the fast path or wrap around the main loop,
         # but takes the special condition at the end of the loop
-        np.testing.assert_allclose(res[449:453], [-0.7713655663874537, -0.5638348710823825,
-                                                  -0.9415838853097869, 0.6212784278881248])
+        np.testing.assert_allclose(res[449:453],
+                                   [-0.7713655663874537, -0.5638348710823825, -0.9415838853097869, 0.6212784278881248])
         # 2013 takes the rare secondary loop and exits in the first iteration
         # taking the positive value
-        np.testing.assert_allclose(res[2011:2015], [0.4201922976982861, 2.7021541445373916,
-                                                    3.7809967764329375, 0.19919094793393655])
+        np.testing.assert_allclose(res[2011:2015],
+                                   [0.4201922976982861, 2.7021541445373916, 3.7809967764329375, 0.19919094793393655])
         # 5136 takes the rare secondary loop and exits in the first iteration
         # taking the negative value
-        np.testing.assert_allclose(res[5134:5138], [0.12317411414687844, -0.17846827974421134,
-                                                    -3.6579887696059714, 0.2501530374224693])
+        np.testing.assert_allclose(res[5134:5138],
+                                   [0.12317411414687844, -0.17846827974421134, -3.6579887696059714, 0.2501530374224693])
         # 190855 takes the rare secondary loop and needs more than one iteration
-        np.testing.assert_allclose(res[190853:190857], [-0.26418319904491194, 0.35889007879353746,
-                                                        -3.843811523424439, -1.5256469840469997])
+        np.testing.assert_allclose(res[190853:190857],
+                                   [-0.26418319904491194, 0.35889007879353746, -3.843811523424439, -1.5256469840469997])
+
     elif fp_type is pnlvm.ir.FloatType():
         # The indices are taken from above and don't have special meaning.
         np.testing.assert_allclose(res[0:2], [-0.24822916090488434, -0.02676701545715332])
-        np.testing.assert_allclose(res[207:211], [-0.33086925745010376, -1.024695873260498,
-                                                  -0.5162619352340698, -0.15033885836601257])
-        np.testing.assert_allclose(res[449:453], [-0.2223609834909439, 0.16769859194755554,
-                                                  -0.7806711196899414, 0.5867824554443359])
-        np.testing.assert_allclose(res[2011:2015], [0.1979091316461563, -0.23467595875263214,
-                                                    1.1458240747451782, -1.0285860300064087])
-        np.testing.assert_allclose(res[5134:5138], [-1.0523858070373535, -3.007537603378296,
-                                                    -0.4331461489200592, -0.8841480612754822])
-        np.testing.assert_allclose(res[190853:190857], [-0.8958197236061096, 0.10532315075397491,
-                                                        2.000257730484009, -1.129721999168396])
+        np.testing.assert_allclose(res[207:211],
+                                   [-0.33086925745010376, -1.024695873260498, -0.5162619352340698, -0.15033885836601257])
+        np.testing.assert_allclose(res[449:453],
+                                   [-0.2223609834909439, 0.16769859194755554, -0.7806711196899414, 0.5867824554443359])
+        np.testing.assert_allclose(res[2011:2015],
+                                   [0.1979091316461563, -0.23467595875263214, 1.1458240747451782, -1.0285860300064087])
+        np.testing.assert_allclose(res[5134:5138],
+                                   [-1.0523858070373535, -3.007537603378296, -0.4331461489200592, -0.8841480612754822])
+        np.testing.assert_allclose(res[190853:190857],
+                                   [-0.8958197236061096, 0.10532315075397491, 2.000257730484009, -1.129721999168396])
+
     assert not any(np.isnan(res)), list(np.isnan(res)).index(True)
     benchmark(f)
@@ -287,35 +317,40 @@ def test_random_binomial(benchmark, mode, fp_type, n, p, exp_64, exp_32):
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             return prng.binomial(n, p)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        c_state = init_fun.byref_arg_types[0]()
-        init_fun(c_state, SEED)
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
+        init_fun(state, SEED)
+
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial', numpy_args=(0, 1, 2, 3))
+        n = np.asarray(n, dtype=gen_fun.np_params[1])
+        p = np.asarray(p, dtype=gen_fun.np_params[2])
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial')
-        c_n = gen_fun.byref_arg_types[1](n)
-        c_p = gen_fun.byref_arg_types[2](p)
-        c_out = gen_fun.byref_arg_types[-1]()
         def f():
-            gen_fun(c_state, c_n, c_p, c_out)
-            return c_out.value
+            out = gen_fun.np_buffer_for_arg(1)
+            gen_fun(state, n, p, out)
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial')
-        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.array([n], dtype=np.dtype(gen_fun.byref_arg_types[1])))
-        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.array([p], dtype=np.dtype(gen_fun.byref_arg_types[2])))
-        out = np.array([0.0], dtype=np.dtype(gen_fun.byref_arg_types[3]))
+        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_params[1]))
+        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_params[2]))
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
         def f():
             gen_fun.cuda_call(gpu_state, gpu_n, gpu_p, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
diff --git a/tests/llvm/test_builtins_vector.py b/tests/llvm/test_builtins_vector.py
index d840b7acba8..999a7e42696 100644
--- a/tests/llvm/test_builtins_vector.py
+++ b/tests/llvm/test_builtins_vector.py
@@ -6,6 +6,7 @@
 DIM_X=1500
+
 # These are just basic tests to check that vector indexing and operations
 # work correctly when compiled. The values don't matter much.
 # Might as well make them representable in fp32 for single precision testing.
@@ -13,13 +14,11 @@
 v = np.random.rand(DIM_X).astype(np.float32).astype(np.float64)
 scalar = np.random.rand()
-
 add_res = np.add(u, v)
 sub_res = np.subtract(u, v)
 mul_res = np.multiply(u, v)
 smul_res = np.multiply(u, scalar)
-
 @pytest.mark.benchmark(group="Hadamard")
 @pytest.mark.parametrize("op, v, builtin, result", [
     (np.add, v, "__pnl_builtin_vec_add", add_res),
@@ -28,18 +27,21 @@
     (np.multiply, scalar, "__pnl_builtin_vec_scalar_mult", smul_res),
     ], ids=["ADD", "SUB", "MUL", "SMUL"])
 def test_vector_op(benchmark, op, v, builtin, result, func_mode):
+
+    def _numpy_args(bin_f):
+        np_u = u.astype(bin_f.np_params[0])
+        np_v = bin_f.np_params[1].type(v) if np.isscalar(v) else v.astype(bin_f.np_params[1])
+        np_res = np.empty_like(np_u)
+
+        return np_u, np_v, np_res
+
     if func_mode == 'Python':
         def ex():
             return op(u, v)
+
     elif func_mode == 'LLVM':
         bin_f = pnlvm.LLVMBinaryFunction.get(builtin)
-        dty = np.dtype(bin_f.byref_arg_types[0])
-        assert dty == np.dtype(bin_f.byref_arg_types[1])
-        assert dty == np.dtype(bin_f.byref_arg_types[3])
-
-        lu = u.astype(dty)
-        lv = dty.type(v) if np.isscalar(v) else v.astype(dty)
-        lres = np.empty_like(lu)
+        lu, lv, lres = _numpy_args(bin_f)
         ct_u = lu.ctypes.data_as(bin_f.c_func.argtypes[0])
         ct_v = lv if np.isscalar(lv) else lv.ctypes.data_as(bin_f.c_func.argtypes[1])
@@ -51,17 +53,12 @@ def ex():
     elif func_mode == 'PTX':
         bin_f = pnlvm.LLVMBinaryFunction.get(builtin)
-        dty = np.dtype(bin_f.byref_arg_types[0])
-        assert dty == np.dtype(bin_f.byref_arg_types[1])
-        assert dty == np.dtype(bin_f.byref_arg_types[3])
-
-        lu = u.astype(dty)
-        lv = dty.type(v) if np.isscalar(v) else v.astype(dty)
-        lres = np.empty_like(lu)
+        lu, lv, lres = _numpy_args(bin_f)
         cuda_u = pnlvm.jit_engine.pycuda.driver.In(lu)
         cuda_v = lv if np.isscalar(lv) else pnlvm.jit_engine.pycuda.driver.In(lv)
         cuda_res = pnlvm.jit_engine.pycuda.driver.Out(lres)
+
         def ex():
             bin_f.cuda_call(cuda_u, cuda_v, np.int32(DIM_X), cuda_res)
             return lres
@@ -72,30 +69,35 @@ def ex():
 @pytest.mark.benchmark(group="Sum")
 def test_vector_sum(benchmark, func_mode):
+
     if func_mode == 'Python':
         def ex():
             return np.sum(u)
+
     elif func_mode == 'LLVM':
-        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum")
+        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
-        lu = u.astype(np.dtype(bin_f.byref_arg_types[0]))
-        llvm_res = np.empty(1, dtype=lu.dtype)
+        np_u = u.astype(bin_f.np_params[0])
+        np_res = bin_f.np_buffer_for_arg(2)
-        ct_u = lu.ctypes.data_as(bin_f.c_func.argtypes[0])
-        ct_res = llvm_res.ctypes.data_as(bin_f.c_func.argtypes[2])
+        ct_u = np_u.ctypes.data_as(bin_f.c_func.argtypes[0])
         def ex():
-            bin_f(ct_u, DIM_X, ct_res)
-            return llvm_res[0]
+            bin_f(ct_u, DIM_X, np_res)
+            return np_res
+
     elif func_mode == 'PTX':
-        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum")
-        lu = u.astype(np.dtype(bin_f.byref_arg_types[0]))
-        cuda_u = pnlvm.jit_engine.pycuda.driver.In(lu)
-        res = np.empty(1, dtype=lu.dtype)
-        cuda_res = pnlvm.jit_engine.pycuda.driver.Out(res)
+        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
+
+        np_u = u.astype(bin_f.np_params[0])
+        np_res = bin_f.np_buffer_for_arg(2)
+
+        cuda_u = pnlvm.jit_engine.pycuda.driver.In(np_u)
+        cuda_res = pnlvm.jit_engine.pycuda.driver.Out(np_res)
+
         def ex():
             bin_f.cuda_call(cuda_u, np.int32(DIM_X), cuda_res)
-            return res[0]
+            return np_res
     res = benchmark(ex)
-    np.testing.assert_allclose(res, sum(u))
+    np.testing.assert_allclose(res, np.sum(u))
diff --git a/tests/llvm/test_compile.py b/tests/llvm/test_compile.py
index 406fc1e2430..c396cba594f 100644
--- a/tests/llvm/test_compile.py
+++ b/tests/llvm/test_compile.py
@@ -4,52 +4,48 @@
 from psyneulink.core import llvm as pnlvm
-ITERATIONS=100
 DIM_X=1000
 DIM_Y=2000
 @pytest.mark.llvm
 def test_recompile():
     # The original builtin mxv function
-    binf = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
-    dty = np.dtype(binf.byref_arg_types[0])
-    assert dty == np.dtype(binf.byref_arg_types[1])
-    assert dty == np.dtype(binf.byref_arg_types[4])
+    bin_f = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
-    matrix = np.random.rand(DIM_X, DIM_Y).astype(dty)
-    vector = np.random.rand(DIM_X).astype(dty)
-    llvm_res = np.empty(DIM_Y, dtype=dty)
+    vector = np.random.rand(DIM_X).astype(bin_f.np_params[0].base)
+    matrix = np.random.rand(DIM_X, DIM_Y).astype(bin_f.np_params[1].base)
+    llvm_res = np.empty(DIM_Y, dtype=bin_f.np_params[4].base)
     x, y = matrix.shape
-    ct_vec = vector.ctypes.data_as(binf.c_func.argtypes[0])
-    ct_mat = matrix.ctypes.data_as(binf.c_func.argtypes[1])
+    ct_vec = vector.ctypes.data_as(bin_f.c_func.argtypes[0])
+    ct_mat = matrix.ctypes.data_as(bin_f.c_func.argtypes[1])
     orig_res = np.empty_like(llvm_res)
-    ct_res = orig_res.ctypes.data_as(binf.c_func.argtypes[4])
+    ct_res = orig_res.ctypes.data_as(bin_f.c_func.argtypes[4])
-    binf.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f.c_func(ct_vec, ct_mat, x, y, ct_res)
     # Rebuild and try again
     # This is not a public API
     pnlvm._llvm_build()
     rebuild_res = np.empty_like(llvm_res)
-    ct_res = rebuild_res.ctypes.data_as(binf.c_func.argtypes[4])
+    ct_res = rebuild_res.ctypes.data_as(bin_f.c_func.argtypes[4])
-    binf.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f.c_func(ct_vec, ct_mat, x, y, ct_res)
     assert np.array_equal(orig_res, rebuild_res)
     # Get a new pointer
-    binf2 = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
+    bin_f2 = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
     new_res = np.empty_like(llvm_res)
-    ct_res = new_res.ctypes.data_as(binf2.c_func.argtypes[4])
+    ct_res = new_res.ctypes.data_as(bin_f2.c_func.argtypes[4])
-    binf2.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f2.c_func(ct_vec, ct_mat, x, y, ct_res)
     assert np.array_equal(rebuild_res, new_res)
     callable_res = np.empty_like(llvm_res)
-    ct_res = callable_res.ctypes.data_as(binf.c_func.argtypes[4])
+    ct_res = callable_res.ctypes.data_as(bin_f.c_func.argtypes[4])
-    binf2(ct_vec, ct_mat, x, y, ct_res)
+    bin_f2(ct_vec, ct_mat, x, y, ct_res)
     assert np.array_equal(new_res, callable_res)
diff --git a/tests/llvm/test_helpers.py b/tests/llvm/test_helpers.py
index f2e2cb141e6..e692bd62f37 100644
--- a/tests/llvm/test_helpers.py
+++ b/tests/llvm/test_helpers.py
@@ -1,6 +1,5 @@
 import ctypes
 import ctypes.util
-import copy
 import numpy as np
 import pytest
 import sys
@@ -16,8 +15,7 @@
 VECTOR = np.random.rand(DIM_X)
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 def test_helper_fclamp(mode):
     with pnlvm.LLVMBuilderContext.get_current() as ctx:
@@ -46,12 +44,13 @@ def test_helper_fclamp(mode):
     ref = np.clip(VECTOR, TST_MIN, TST_MAX)
     bounds = np.asfarray([TST_MIN, TST_MAX])
+
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
-    local_vec = copy.deepcopy(VECTOR)
+    local_vec = VECTOR.copy()
+
     if mode == 'CPU':
-        ct_ty = ctypes.POINTER(bin_f.byref_arg_types[0])
-        ct_vec = local_vec.ctypes.data_as(ct_ty)
-        ct_bounds = bounds.ctypes.data_as(ct_ty)
+        ct_vec = local_vec.ctypes.data_as(bin_f.c_func.argtypes[0])
+        ct_bounds = bounds.ctypes.data_as(bin_f.c_func.argtypes[2])
         bin_f(ct_vec, DIM_X, ct_bounds)
     else:
@@ -61,8 +60,7 @@ def test_helper_fclamp(mode):
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 def test_helper_fclamp_const(mode):
     with pnlvm.LLVMBuilderContext.get_current() as ctx:
@@ -85,12 +83,12 @@ def test_helper_fclamp_const(mode):
         builder.ret_void()
-    local_vec = copy.deepcopy(VECTOR)
+    local_vec = VECTOR.copy()
     ref = np.clip(VECTOR, TST_MIN, TST_MAX)
+
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
     if mode == 'CPU':
-        ct_ty = ctypes.POINTER(bin_f.byref_arg_types[0])
-        ct_vec = local_vec.ctypes.data_as(ct_ty)
+        ct_vec = local_vec.ctypes.data_as(bin_f.c_func.argtypes[0])
         bin_f(ct_vec, DIM_X)
     else:
@@ -100,10 +98,8 @@ def test_helper_fclamp_const(mode):
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
-@pytest.mark.parametrize('rtol,atol',
-                         [[0, 0], [None, None], [None, 100], [2, None]])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
+@pytest.mark.parametrize('rtol,atol', [[0, 0], [None, None], [None, 100], [2, None]])
 @pytest.mark.parametrize('var1,var2', [[1, 1], [1, 100], [1,2], [-4,5], [0, -100], [-1,-2],
                                        [[1,1,1,-4,0,-1], [1,100,2,5,-100,-2]]
@@ -148,18 +144,16 @@ def test_helper_is_close(mode, var1, var2, rtol, atol, fp_type):
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
-    dty = np.dtype(bin_f.byref_arg_types[0])
-    vec1 = np.atleast_1d(np.asfarray(var1, dtype=dty))
-    vec2 = np.atleast_1d(np.asfarray(var2, dtype=dty))
+    vec1 = np.atleast_1d(np.asfarray(var1, dtype=bin_f.np_params[0].base))
+    vec2 = np.atleast_1d(np.asfarray(var2, dtype=bin_f.np_params[1].base))
     assert len(vec1) == len(vec2)
     res = np.empty_like(vec2)
     ref = np.isclose(vec1, vec2, **tolerance)
     if mode == 'CPU':
-        ct_ty = ctypes.POINTER(bin_f.byref_arg_types[0])
-        ct_vec1 = vec1.ctypes.data_as(ct_ty)
-        ct_vec2 = vec2.ctypes.data_as(ct_ty)
-        ct_res = res.ctypes.data_as(ct_ty)
+        ct_vec1 = vec1.ctypes.data_as(bin_f.c_func.argtypes[0])
+        ct_vec2 = vec2.ctypes.data_as(bin_f.c_func.argtypes[1])
+        ct_res = res.ctypes.data_as(bin_f.c_func.argtypes[2])
         bin_f(ct_vec1, ct_vec2, ct_res, len(res))
     else:
@@ -169,10 +163,8 @@ def test_helper_is_close(mode, var1, var2, rtol, atol, fp_type):
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
-@pytest.mark.parametrize('rtol,atol',
-                         [[0, 0], [None, None], [None, 100], [2, None]])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
+@pytest.mark.parametrize('rtol,atol', [[0, 0], [None, None], [None, 100], [2, None]])
 @pytest.mark.parametrize('var1,var2', [[1, 1], [1, 100], [1,2], [-4,5], [0, -100], [-1,-2],
                                        [[1,1,1,-4,0,-1], [1,100,2,5,-100,-2]]
@@ -191,8 +183,7 @@ def test_helper_all_close(mode, var1, var2, atol, rtol):
     with pnlvm.LLVMBuilderContext.get_current() as ctx:
         arr_ptr_ty = ir.ArrayType(ir.DoubleType(), len(vec1)).as_pointer()
-        func_ty = ir.FunctionType(ir.VoidType(), [arr_ptr_ty, arr_ptr_ty,
-                                                  ir.IntType(32).as_pointer()])
+        func_ty = ir.FunctionType(ir.VoidType(), [arr_ptr_ty, arr_ptr_ty, ir.IntType(32).as_pointer()])
         custom_name = ctx.get_unique_name("all_close")
         function = ir.Function(ctx.module, func_ty, name=custom_name)
@@ -207,18 +198,14 @@ def test_helper_all_close(mode, var1, var2, atol, rtol):
     ref = np.allclose(vec1, vec2, **tolerance)
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
-    if mode == 'CPU':
-        ct_ty = ctypes.POINTER(bin_f.byref_arg_types[0])
-        ct_vec1 = vec1.ctypes.data_as(ct_ty)
-        ct_vec2 = vec2.ctypes.data_as(ct_ty)
-        res = ctypes.c_uint32()
+    res = np.array(5, dtype=np.uint32)
+
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
-        bin_f(ct_vec1, ct_vec2, ctypes.byref(res))
+    if mode == 'CPU':
+        bin_f(vec1, vec2, res)
     else:
-        res = np.array([5], dtype=np.uint32)
         bin_f.cuda_wrap_call(vec1, vec2, res)
-        res = res[0]
     assert np.array_equal(res, ref)
@@ -425,9 +412,9 @@ def test_helper_get_array_shape(self, ir_type, expected):
     def test_helper_array_from_shape(self, ir_type, shape):
         assert ir_type == pnlvm.helpers.array_from_shape(shape, self.DOUBLE_TYPE)
+
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 @pytest.mark.parametrize('op,var,expected', [
     (pnlvm.helpers.tanh, 1.0, 0.7615941559557649),
     (pnlvm.helpers.exp, 1.0, 2.718281828459045),
     (pnlvm.helpers.log, 1.0, 0.0),
     (pnlvm.helpers.log1p, 1.0, 0.6931471805599453),
 ])
-@pytest.mark.parametrize('fp_type', [pnlvm.ir.DoubleType(), pnlvm.ir.FloatType()],
-                         ids=lambda x: str(x))
+@pytest.mark.parametrize('fp_type', [pnlvm.ir.DoubleType(), pnlvm.ir.FloatType()], ids=str)
 def test_helper_numerical(mode, op, var, expected, fp_type):
     with pnlvm.LLVMBuilderContext(fp_type) as ctx:
         func_ty = ir.FunctionType(ir.VoidType(), [ctx.float_ty.as_pointer()])
@@ -454,20 +440,19 @@ def test_helper_numerical(mode, op, var, expected, fp_type):
         builder.ret_void()
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0,))
+
+    res = np.asfarray(var, dtype=bin_f.np_params[0])
+
     if mode == 'CPU':
-        res = bin_f.byref_arg_types[0](var)
-        bin_f(ctypes.byref(res))
-        res = res.value
+        bin_f(res)
     else:
-        res = np.ctypeslib.as_array(bin_f.byref_arg_types[0](var))
         bin_f.cuda_wrap_call(res)
     np.testing.assert_allclose(res, expected)
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 @pytest.mark.parametrize('var,expected', [
     (np.asfarray([1,2,3]), np.asfarray([2,3,4])),
     (np.asfarray([[1,2],[3,4]]), np.asfarray([[2,3],[4,5]])),
@@ -488,26 +473,20 @@ def test_helper_elementwise_op(mode, var, expected):
                                              lambda ctx, builder, x: builder.fadd(x.type(1.0), x), out)
         builder.ret_void()
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
-    # convert input to the right type
-    dt = np.dtype(bin_f.byref_arg_types[0])
-    dt = np.empty(1, dtype=dt).flatten().dtype
-    var = var.astype(dt)
+    vec = np.asfarray(var, dtype=bin_f.np_params[0].base)
+    res = bin_f.np_buffer_for_arg(1)
     if mode == 'CPU':
-        ct_vec = np.ctypeslib.as_ctypes(var)
-        res = bin_f.byref_arg_types[1]()
-        bin_f(ct_vec, ctypes.byref(res))
+        bin_f(vec, res)
     else:
-        res = np.empty_like(var)
-        bin_f.cuda_wrap_call(var, res)
+        bin_f.cuda_wrap_call(vec, res)
     assert np.array_equal(res, expected)
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 @pytest.mark.parametrize('var1,var2,expected', [
     (np.array([1.,2.,3.]), np.array([1.,2.,3.]), np.array([2.,4.,6.])),
     (np.array([1.,2.,3.]), np.array([0.,1.,2.]), np.array([1.,3.,5.])),
@@ -537,24 +516,19 @@ def test_helper_recursive_iterate_arrays(mode, var1, var2, expected):
             a = builder.load(a_ptr)
             b = builder.load(b_ptr)
             builder.store(builder.fadd(a,b), o_ptr)
+
         builder.ret_void()
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
-    # convert input to the right type
-    dt = np.dtype(bin_f.byref_arg_types[0])
-    dt = np.empty(1, dtype=dt).flatten().dtype
-    var1 = var1.astype(dt)
-    var2 = var2.astype(dt)
+    vec1 = np.asfarray(var1, dtype=bin_f.np_params[0].base)
+    vec2 = np.asfarray(var2, dtype=bin_f.np_params[0].base)
+    res = bin_f.np_buffer_for_arg(1)
     if mode == 'CPU':
-        ct_vec = np.ctypeslib.as_ctypes(var1)
-        ct_vec_2 = np.ctypeslib.as_ctypes(var2)
-        res = bin_f.byref_arg_types[2]()
-        bin_f(ct_vec, ct_vec_2, ctypes.byref(res))
+        bin_f(vec1, vec2, res)
     else:
-        res = np.empty_like(var1)
-        bin_f.cuda_wrap_call(var1, var2, res)
+        bin_f.cuda_wrap_call(vec1, vec2, res)
     assert np.array_equal(res, expected)
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 @pytest.mark.parametrize('t1', _fp_types)
 @pytest.mark.parametrize('t2', _fp_types)
 @pytest.mark.parametrize('val', [1.0, '-Inf', 'Inf', 'NaN', 16777216, 16777217, -1.0])
@@ -582,21 +555,18 @@ def test_helper_convert_fp_type(t1, t2, mode, val):
         builder.store(conv_x, y)
         builder.ret_void()
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
-    # Convert type to numpy dtype
-    npt1, npt2 = (np.dtype(bin_f.byref_arg_types[x]) for x in (0, 1))
-    npt1, npt2 = (np.float16().dtype if x == np.uint16 else x for x in (npt1, npt2))
+    # Get the argument numpy dtype
+    np_dt1, np_dt2 = (np.dtype(bin_f.np_params[i]) for i in (0, 1))
     # instantiate value, result and reference
-    x = np.asfarray(val, dtype=npt1)
-    y = np.asfarray(np.random.rand(), dtype=npt2)
-    ref = x.astype(npt2)
+    x = np.asfarray(val, dtype=np_dt1)
+    y = np.asfarray(0, dtype=np_dt2)
+    ref = x.astype(np_dt2)
     if mode == 'CPU':
-        ct_x = x.ctypes.data_as(bin_f.c_func.argtypes[0])
-        ct_y = y.ctypes.data_as(bin_f.c_func.argtypes[1])
-        bin_f(ct_x, ct_y)
+        bin_f(x, y)
     else:
         bin_f.cuda_wrap_call(x, y)
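The test changes above all move in the same direction: compiled builtins are driven through numpy buffers rather than hand-built ctypes objects. As a condensed illustration (not part of the change itself), the sketch below strings together only the helpers that appear in this diff, reusing the __pnl_builtin_vec_sum usage from test_builtins_vector.py; exact dtypes come from the function's np_params and the output buffer from np_buffer_for_arg.

    import numpy as np
    from psyneulink.core import llvm as pnlvm

    DIM_X = 1500
    u = np.random.rand(DIM_X)

    # Ask for argument 2 (the result) to be handled as a numpy buffer.
    bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))

    # Inputs are cast to the dtype advertised by np_params; the output
    # buffer is allocated by the binary function itself.
    np_u = u.astype(bin_f.np_params[0])
    np_res = bin_f.np_buffer_for_arg(2)

    bin_f(np_u.ctypes.data_as(bin_f.c_func.argtypes[0]), DIM_X, np_res)
    np.testing.assert_allclose(np_res, np.sum(u))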