diff --git a/dev_requirements.txt b/dev_requirements.txt index 645d3b8eee6..afc8bbefd42 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,6 +1,6 @@ jupyter<1.0.1 packaging<25.0 -pytest<8.3.2 +pytest<8.3.3 pytest-benchmark<4.0.1 pytest-cov<5.0.1 pytest-forked<1.7.0 diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py index 2e157d7b324..5ff6359225e 100644 --- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py +++ b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py @@ -2096,44 +2096,33 @@ def _function(self, # if ocm is not None and ocm.parameters.comp_execution_mode._get(context) in {"PTX", "LLVM"}: if ocm is not None and ocm.parameters.comp_execution_mode._get(context) in {"PTX", "LLVM"}: - # If we have a numpy array, convert back to ctypes - if isinstance(all_values, np.ndarray): - ct_values = all_values.flatten().ctypes.data_as(ctypes.POINTER(ctypes.c_double)) - num_values = len(all_values.flatten()) - else: - ct_values = all_values - num_values = len(ct_values) + ct_values = all_values + num_values = len(ct_values) # Reduce array of values to min/max # select_min params are: # params, state, min_sample_ptr, sample_ptr, min_value_ptr, value_ptr, opt_count_ptr, count min_tags = frozenset({"select_min", "evaluate_type_objective"}) - bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags) + bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags, numpy_args=(2, 4, 6)) + ct_param = bin_func.byref_arg_types[0](*self._get_param_initializer(context)) ct_state = bin_func.byref_arg_types[1](*self._get_state_initializer(context)) - ct_opt_sample = bin_func.byref_arg_types[2](float("NaN")) - ct_alloc = None # NULL for samples - ct_opt_value = bin_func.byref_arg_types[4]() - ct_opt_count = bin_func.byref_arg_types[6](0) - ct_start = bin_func.c_func.argtypes[7](0) - ct_stop = bin_func.c_func.argtypes[8](num_values) - - bin_func(ct_param, ct_state, ct_opt_sample, ct_alloc, ct_opt_value, - ct_values, ct_opt_count, ct_start, ct_stop) - - optimal_value = ct_opt_value.value - optimal_sample = np.ctypeslib.as_array(ct_opt_sample) - - if not isinstance(all_values, np.ndarray): - all_values = np.ctypeslib.as_array(ct_values) - - # These are normally stored in the parent function (OptimizationFunction). - # Since we didn't call super()._function like the python path, - # save the values here - if self.parameters.save_samples._get(context): - self.parameters.saved_samples._set(all_samples, context) - if self.parameters.save_values._get(context): - self.parameters.saved_values._set(all_values, context) + optimal_sample = bin_func.np_buffer_for_arg(2) + optimal_value = bin_func.np_buffer_for_arg(4) + number_of_optimal_values = bin_func.np_buffer_for_arg(6, fill_value=0) + + bin_func(ct_param, + ct_state, + optimal_sample, + None, # samples. NULL, it's generated by the function. + optimal_value, + ct_values, + number_of_optimal_values, + bin_func.c_func.argtypes[7](0), # start + bin_func.c_func.argtypes[8](num_values)) # stop + + # Convert outputs to Numpy/Python + all_values = np.ctypeslib.as_array(ct_values) # Python version else: @@ -2153,6 +2142,12 @@ def _function(self, [all_samples[:,i] for i in range(all_samples.shape[1])]) optimal_value, optimal_sample = next(value_sample_pairs) + # The algorithm below implements "Reservoir sampling"[0]. This + # matches the compiled implementation of "select_min". 
The + # advantage of reservoir sampling is constant memory requirements + # and a single pass over the evaluated values. + # The disadvantage is multiple calls to the PRNG. + # https://en.wikipedia.org/wiki/Reservoir_sampling select_randomly = self.parameters.select_randomly_from_optimal_values._get(context) for value, sample in value_sample_pairs: if select_randomly and np.allclose(value, optimal_value): diff --git a/psyneulink/core/compositions/composition.py b/psyneulink/core/compositions/composition.py index 804ed9f6f45..79177e1c3b2 100644 --- a/psyneulink/core/compositions/composition.py +++ b/psyneulink/core/compositions/composition.py @@ -11681,7 +11681,8 @@ def _execute_controller(self, assert (execution_mode == pnlvm.ExecutionMode.LLVM or execution_mode & pnlvm.ExecutionMode._Fallback),\ f"PROGRAM ERROR: Unrecognized compiled execution_mode: '{execution_mode}'." - _comp_ex.execute_node(self.controller, context=context) + _comp_ex.freeze_values() + _comp_ex.execute_node(self.controller) context.remove_flag(ContextFlags.PROCESSING) @@ -12010,7 +12011,7 @@ def execute( build_CIM_input = self._build_variable_for_input_CIM(inputs) if execution_mode & pnlvm.ExecutionMode.COMPILED: - _comp_ex.execute_node(self.input_CIM, inputs, context) + _comp_ex.execute_node(self.input_CIM, inputs) # FIXME: parameter_CIM should be executed here as well, # but node execution of nested compositions with # outside control is not supported yet. @@ -12295,7 +12296,7 @@ def execute( # Execute Mechanism if execution_mode & pnlvm.ExecutionMode.COMPILED: - _comp_ex.execute_node(node, context=context) + _comp_ex.execute_node(node) else: if node is not self.controller: mech_context = copy(context) @@ -12507,7 +12508,7 @@ def execute( # Extract result here if execution_mode & pnlvm.ExecutionMode.COMPILED: _comp_ex.freeze_values() - _comp_ex.execute_node(self.output_CIM, context=context) + _comp_ex.execute_node(self.output_CIM) report(self, PROGRESS_REPORT, report_num=report_num, diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py index bd931413e9b..568ef7ec910 100644 --- a/psyneulink/core/llvm/__init__.py +++ b/psyneulink/core/llvm/__init__.py @@ -23,7 +23,7 @@ from . 
import codegen from .builder_context import * -from .builder_context import _all_modules, _convert_llvm_ir_to_ctype +from .builder_context import _all_modules, _convert_llvm_ir_to_ctype, _convert_llvm_ir_to_dtype from .debug import debug_env from .execution import * from .execution import _tupleize @@ -123,7 +123,7 @@ def _llvm_build(target_generation=_binary_generation + 1): class LLVMBinaryFunction: - def __init__(self, name: str): + def __init__(self, name: str, *, numpy_args=()): self.name = name self.__c_func = None @@ -143,17 +143,25 @@ def __init__(self, name: str): # Create ctype function instance start = time.perf_counter() return_type = _convert_llvm_ir_to_ctype(f.return_value.type) - params = [_convert_llvm_ir_to_ctype(a.type) for a in f.args] + args = [_convert_llvm_ir_to_ctype(a.type) for a in f.args] + + # '_type_' special attribute stores pointee type for pointers + # https://docs.python.org/3/library/ctypes.html#ctypes._Pointer._type_ + self.byref_arg_types = [a._type_ if hasattr(a, "contents") else None for a in args] + self.np_params = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args] + + for a in numpy_args: + assert self.byref_arg_types[a] is not None + args[a] = np.ctypeslib.ndpointer(dtype=self.np_params[a].base, shape=self.np_params[a].shape) + middle = time.perf_counter() - self.__c_func_type = ctypes.CFUNCTYPE(return_type, *params) + self.__c_func_type = ctypes.CFUNCTYPE(return_type, *args) finish = time.perf_counter() if "time_stat" in debug_env: print("Time to create ctype function '{}': {} ({} to create types)".format( name, finish - start, middle - start)) - self.byref_arg_types = [p._type_ for p in params] - @property def c_func(self): if self.__c_func is None: @@ -218,18 +226,26 @@ def cuda_wrap_call(self, *args, **kwargs): wrap_args = (jit_engine.pycuda.driver.InOut(a) if isinstance(a, np.ndarray) else a for a in args) self.cuda_call(*wrap_args, **kwargs) + def np_buffer_for_arg(self, arg_num, *, extra_dimensions=(), fill_value=np.nan): + + out_base = self.np_params[arg_num].base + out_shape = extra_dimensions + self.np_params[arg_num].shape + + # fill the buffer with NaN poison + return np.full(out_shape, fill_value, dtype=out_base) + @staticmethod @functools.lru_cache(maxsize=32) - def from_obj(obj, *, tags:frozenset=frozenset()): + def from_obj(obj, *, tags:frozenset=frozenset(), numpy_args:tuple=()): name = LLVMBuilderContext.get_current().gen_llvm_function(obj, tags=tags).name - return LLVMBinaryFunction.get(name) + return LLVMBinaryFunction.get(name, numpy_args=numpy_args) @staticmethod @functools.lru_cache(maxsize=32) - def get(name: str): - return LLVMBinaryFunction(name) + def get(name: str, *, numpy_args:tuple=()): + return LLVMBinaryFunction(name, numpy_args=numpy_args) - def get_multi_run(self): + def get_multi_run(self, *, numpy_args=()): try: multirun_llvm = _find_llvm_function(self.name + "_multirun") except ValueError: @@ -237,7 +253,7 @@ def get_multi_run(self): with LLVMBuilderContext.get_current() as ctx: multirun_llvm = codegen.gen_multirun_wrapper(ctx, function) - return LLVMBinaryFunction.get(multirun_llvm.name) + return LLVMBinaryFunction.get(multirun_llvm.name, numpy_args=numpy_args) _cpu_engine = None diff --git a/psyneulink/core/llvm/builder_context.py b/psyneulink/core/llvm/builder_context.py index a4dd418f6f7..edc77fddad9 100644 --- a/psyneulink/core/llvm/builder_context.py +++ b/psyneulink/core/llvm/builder_context.py @@ -52,7 +52,7 @@ def module_count(): 'mt_rand_init', 'philox_rand_init')) -class 
_node_wrapper(): +class _node_assembly(): def __init__(self, composition, node): self._comp = weakref.proxy(composition) self._node = node @@ -61,7 +61,7 @@ def __repr__(self): return "Node wrapper for node '{}' in composition '{}'".format(self._node, self._comp) def _gen_llvm_function(self, *, ctx, tags:frozenset): - return codegen.gen_node_wrapper(ctx, self._comp, self._node, tags=tags) + return codegen.gen_node_assembly(ctx, self._comp, self._node, tags=tags) def _comp_cached(func): @functools.wraps(func) @@ -349,6 +349,13 @@ def get_state_space(self, builder, component, state_ptr, param): return helpers.get_state_space(builder, component, state_ptr, param_name) def check_used_params(self, component, *, tags:frozenset): + """ + This function checks that parameters included in the compiled structures are used in compiled code. + + If the assertion in this function triggers the parameter name should be added to the parameter + block list in the Component class. + """ + # Skip the check if the parameter use is not tracked. Some components (like node wrappers) # don't even have parameters. if component not in self._component_state_use and component not in self._component_param_use: @@ -378,12 +385,6 @@ def check_used_params(self, component, *, tags:frozenset): if hasattr(component, 'evaluate_agent_rep'): used_param_ids.add('num_trials_per_estimate') - if hasattr(component, 'adapt_scale'): - used_param_ids.add('threshold') - used_param_ids.add('adapt_scale') - used_param_ids.add('adapt_base') - used_param_ids.add('adapt_entropy_weighting') - unused_param_ids = component_param_ids - used_param_ids - initializers unused_state_ids = component_state_ids - used_state_ids @@ -504,38 +505,37 @@ def get_data_struct_type(self, component): return ir.LiteralStructType([]) - def get_node_wrapper(self, composition, node): - cache = getattr(composition, '_wrapped_nodes', None) + def get_node_assembly(self, composition, node): + cache = getattr(composition, '_node_assemblies', None) if cache is None: cache = weakref.WeakKeyDictionary() - setattr(composition, '_wrapped_nodes', cache) - return cache.setdefault(node, _node_wrapper(composition, node)) + setattr(composition, '_node_assemblies', cache) + return cache.setdefault(node, _node_assembly(composition, node)) def convert_python_struct_to_llvm_ir(self, t): self._stats["types_converted"] += 1 if t is None: return ir.LiteralStructType([]) - elif type(t) is list: - if len(t) == 0: - return ir.LiteralStructType([]) - elems_t = [self.convert_python_struct_to_llvm_ir(x) for x in t] - if all(x == elems_t[0] for x in elems_t): - return ir.ArrayType(elems_t[0], len(elems_t)) - return ir.LiteralStructType(elems_t) - elif type(t) is tuple: + + elif isinstance(t, (list, tuple)): elems_t = [self.convert_python_struct_to_llvm_ir(x) for x in t] if len(elems_t) > 0 and all(x == elems_t[0] for x in elems_t): return ir.ArrayType(elems_t[0], len(elems_t)) + return ir.LiteralStructType(elems_t) + elif isinstance(t, enum.Enum): # FIXME: Consider enums of non-int type assert all(round(x.value) == x.value for x in type(t)) return self.int32_ty + elif isinstance(t, (int, float, np.floating)): return self.float_ty + elif isinstance(t, np.integer): # Python 'int' is handled above as it is the default type for '0' return ir.IntType(t.nbytes * 8) + elif isinstance(t, np.ndarray): # 0d uint32 values were likely created from enums (above) and are # observed here after compilation sync. 
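(The hunk above merges the separate list and tuple branches of `convert_python_struct_to_llvm_ir`: homogeneous containers become LLVM array types, heterogeneous ones become literal structs. The new `_convert_llvm_ir_to_dtype` helper, introduced further down in this file, mirrors that rule on the numpy side: arrays become subarray dtypes and structs become structured dtypes, which `LLVMBinaryFunction` then stores in `np_params` and uses both for `np.ctypeslib.ndpointer` argument declarations and for output allocation in `np_buffer_for_arg`. A minimal sketch of that correspondence, using only standard numpy calls and made-up example shapes, not code from the patch:)

```python
import numpy as np

# Homogeneous aggregate (LLVM ArrayType) -> subarray dtype, e.g. [2 x [3 x double]]
arr_dt = np.dtype((np.float64, (2, 3)))
assert arr_dt.subdtype == (np.dtype(np.float64), (2, 3))

# Heterogeneous aggregate (LLVM LiteralStructType) -> structured dtype
struct_dt = np.dtype([("field_0", np.float64),
                      ("field_1", np.uint32, (4,))], align=True)

# np_buffer_for_arg-style allocation: a NaN-filled buffer with the argument's
# base dtype and shape, ready to be passed to the compiled function
out = np.full((2, 3), np.nan, dtype=arr_dt.base)
```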
@@ -543,18 +543,24 @@ def convert_python_struct_to_llvm_ir(self, t): if t.ndim == 0 and t.dtype == np.uint32: return self.convert_python_struct_to_llvm_ir(t.reshape(1)[0]) return self.convert_python_struct_to_llvm_ir(t.tolist()) + elif isinstance(t, np.random.RandomState): return pnlvm.builtins.get_mersenne_twister_state_struct(self) + elif isinstance(t, np.random.Generator): assert isinstance(t.bit_generator, np.random.Philox) return pnlvm.builtins.get_philox_state_struct(self) + elif isinstance(t, Time): return ir.ArrayType(self.int32_ty, len(TimeScale)) + elif isinstance(t, SampleIterator): if isinstance(t.generator, list): return ir.ArrayType(self.float_ty, len(t.generator)) + # Generic iterator is {start, increment, count} return ir.LiteralStructType((self.float_ty, self.float_ty, self.int32_ty)) + assert False, "Don't know how to convert {}".format(type(t)) @@ -765,3 +771,53 @@ def _convert_llvm_ir_to_ctype(t: ir.Type): assert False, "Don't know how to convert LLVM type: {}".format(t) return ret_t + +@functools.lru_cache(maxsize=16) +def _convert_llvm_ir_to_dtype(t: ir.Type): + + if isinstance(t, ir.IntType): + if t.width == 8: + return np.uint8().dtype + + elif t.width == 16: + return np.uint16().dtype + + elif t.width == 32: + return np.uint32().dtype + + elif t.width == 64: + return np.uint64().dtype + + else: + assert False, "Unsupported integer type: {}".format(type(t)) + + elif isinstance(t, ir.DoubleType): + return np.float64().dtype + + elif isinstance(t, ir.FloatType): + return np.float32().dtype + + elif isinstance(t, ir.HalfType): + return np.float16().dtype + + elif isinstance(t, ir.ArrayType): + element_type = _convert_llvm_ir_to_dtype(t.element) + + # Create multidimensional array instead of nesting + if element_type.subdtype is not None: + element_type, shape = element_type.subdtype + else: + shape = () + + ret_t = np.dtype((element_type, (len(t),) + shape)) + + elif isinstance(t, ir.LiteralStructType): + field_list = [] + for i, e in enumerate(t.elements): + field_list.append(("field_" + str(i), _convert_llvm_ir_to_dtype(e))) + + ret_t = np.dtype(field_list, align=True) + else: + assert False, "Don't know how to convert LLVM type to dtype: {}".format(t) + + return ret_t diff --git a/psyneulink/core/llvm/codegen.py b/psyneulink/core/llvm/codegen.py index 16eca1c8ddb..df792ce5fe9 100644 --- a/psyneulink/core/llvm/codegen.py +++ b/psyneulink/core/llvm/codegen.py @@ -585,9 +585,9 @@ def find_max(builder, x): return res -def gen_node_wrapper(ctx, composition, node, *, tags:frozenset): - assert "node_wrapper" in tags - func_tags = tags.difference({"node_wrapper"}) +def gen_node_assembly(ctx, composition, node, *, tags:frozenset): + assert "node_assembly" in tags + func_tags = tags.difference({"node_assembly"}) node_function = ctx.import_llvm_function(node, tags=func_tags) # FIXME: This is a hack @@ -782,14 +782,14 @@ def _gen_composition_exec_context(ctx, composition, *, tags:frozenset, suffix="" params = builder.alloca(const_params.type, name="const_params_loc") builder.store(const_params, params) - node_tags = tags.union({"node_wrapper"}) + node_tags = tags.union({"node_assembly"}) # Call input CIM - input_cim_w = ctx.get_node_wrapper(composition, composition.input_CIM) + input_cim_w = ctx.get_node_assembly(composition, composition.input_CIM) input_cim_f = ctx.import_llvm_function(input_cim_w, tags=node_tags) builder.call(input_cim_f, [state, params, comp_in, data, data]) # Call parameter CIM - param_cim_w = ctx.get_node_wrapper(composition, composition.parameter_CIM) + 
param_cim_w = ctx.get_node_assembly(composition, composition.parameter_CIM) param_cim_f = ctx.import_llvm_function(param_cim_w, tags=node_tags) builder.call(param_cim_f, [state, params, comp_in, data, data]) @@ -803,7 +803,7 @@ def _gen_composition_exec_context(ctx, composition, *, tags:frozenset, suffix="" def gen_composition_exec(ctx, composition, *, tags:frozenset): simulation = "simulation" in tags - node_tags = tags.union({"node_wrapper"}) + node_tags = tags.union({"node_assembly"}) with _gen_composition_exec_context(ctx, composition, tags=tags) as (builder, data, params, cond_gen): state, _, comp_in, _, cond = builder.function.args @@ -823,7 +823,7 @@ def gen_composition_exec(ctx, composition, *, tags:frozenset): is_finished_callbacks = {} for node in composition.nodes: args = [state, params, comp_in, data, output_storage] - wrapper = ctx.get_node_wrapper(composition, node) + wrapper = ctx.get_node_assembly(composition, node) is_finished_callbacks[node] = (wrapper, args) @@ -851,14 +851,14 @@ def gen_composition_exec(ctx, composition, *, tags:frozenset): num_exec_locs, nodes_states) with builder.if_then(reinit_cond): - node_w = ctx.get_node_wrapper(composition, node) + node_w = ctx.get_node_assembly(composition, node) node_reinit_f = ctx.import_llvm_function(node_w, tags=node_tags.union({"reset"})) builder.call(node_reinit_f, [state, params, comp_in, data, data]) # Run controller if it's enabled in 'BEFORE' mode if simulation is False and composition.enable_controller and composition.controller_mode == BEFORE: assert composition.controller is not None - controller_w = ctx.get_node_wrapper(composition, composition.controller) + controller_w = ctx.get_node_assembly(composition, composition.controller) controller_f = ctx.import_llvm_function(controller_w, tags=node_tags) builder.call(controller_f, [state, params, comp_in, data, data]) @@ -929,7 +929,7 @@ def gen_composition_exec(ctx, composition, *, tags:frozenset): run_set_node_ptr = builder.gep(run_set_ptr, [zero, ctx.int32_ty(idx)]) node_cond = builder.load(run_set_node_ptr, name="node_" + node.name + "_should_run") with builder.if_then(node_cond): - node_w = ctx.get_node_wrapper(composition, node) + node_w = ctx.get_node_assembly(composition, node) node_f = ctx.import_llvm_function(node_w, tags=node_tags) builder.block.name = "invoke_" + node_f.name # Wrappers do proper indexing of all structures @@ -984,12 +984,12 @@ def gen_composition_exec(ctx, composition, *, tags:frozenset): if simulation is False and composition.enable_controller and \ composition.controller_mode == AFTER: assert composition.controller is not None - controller_w = ctx.get_node_wrapper(composition, composition.controller) + controller_w = ctx.get_node_assembly(composition, composition.controller) controller_f = ctx.import_llvm_function(controller_w, tags=node_tags) builder.call(controller_f, [state, params, comp_in, data, data]) # Call output CIM - output_cim_w = ctx.get_node_wrapper(composition, composition.output_CIM) + output_cim_w = ctx.get_node_assembly(composition, composition.output_CIM) output_cim_f = ctx.import_llvm_function(output_cim_w, tags=node_tags) builder.block.name = "invoke_" + output_cim_f.name builder.call(output_cim_f, [state, params, comp_in, data, data]) @@ -1180,9 +1180,9 @@ def gen_autodiffcomp_exec(ctx, composition, *, tags:frozenset): pytorch_func = ctx.import_llvm_function(pytorch_model, tags=tags) builder.call(pytorch_func, [state, params, data]) - node_tags = tags.union({"node_wrapper"}) + node_tags = tags.union({"node_assembly"}) 
# Call output CIM - output_cim_w = ctx.get_node_wrapper(composition, composition.output_CIM) + output_cim_w = ctx.get_node_assembly(composition, composition.output_CIM) output_cim_f = ctx.import_llvm_function(output_cim_w, tags=node_tags) builder.call(output_cim_f, [state, params, comp_in, data, data]) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 0d05164887d..f90919b97bc 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -48,25 +48,6 @@ def _tupleize(x): except TypeError: return x if x is not None else tuple() -def _element_dtype(x): - """ - Extract base builtin type from aggregate type. - - Throws assertion failure if the aggregate type includes more than one base type. - The assumption is that array of builtin type has the same binary layout as - the original aggregate and it's easier to construct - """ - dt = np.dtype(x) - while dt.subdtype is not None: - dt = dt.subdtype[0] - - if not dt.isbuiltin: - fdts = (_element_dtype(f[0]) for f in dt.fields.values()) - dt = next(fdts) - assert all(dt == fdt for fdt in fdts) - - assert dt.isbuiltin, "Element type is not builtin: {} from {}".format(dt, np.dtype(x)) - return dt def _pretty_size(size): units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB'] @@ -100,7 +81,9 @@ def _get_compilation_param(self, name, init_method, arg): struct = struct_ty(*initializer) struct_end = time.time() - numpy_struct = np.ctypeslib.as_array(struct) + # numpy "frombuffer" creates a shared memory view of the provided buffer + numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_params[arg], count=len(self._execution_contexts)) + assert numpy_struct.nbytes == ctypes.sizeof(struct), \ "Size mismatch ({}), numpy: {} vs. ctypes:{}".format(name, numpy_struct.nbytes, ctypes.sizeof(struct)) @@ -120,6 +103,8 @@ def _get_compilation_param(self, name, init_method, arg): if len(self._execution_contexts) == 1: + numpy_struct.shape = () + if name == '_state': self._copy_params_to_pnl(self._execution_contexts[0], self._obj, @@ -232,13 +217,16 @@ def _enumerate_recurse(elements): pnl_param.set(value, context=context, override=True, compilation_sync=True) + def _get_indexable(self, np_array): + # outputs in recarrays need to be converted to list/tuple to be indexable + return np_array.tolist() if np_array.dtype.base.shape == () else np_array class CUDAExecution(Execution): - def __init__(self, buffers=['param_struct', 'state_struct', 'out']): + def __init__(self, buffers=['param_struct', 'state_struct']): super().__init__() - self._gpu_buffers = {} - for b in buffers: - self._gpu_buffers["_" + b] = None + + # Initialize GPU buffer map + self._gpu_buffers = {"_" + b: None for b in buffers} @property def _bin_func_multirun(self): @@ -253,7 +241,7 @@ def __get_cuda_arg(self, struct_name, arg_handler): # .array is a public member of pycuda's In/Out ArgumentHandler classes if gpu_buffer is None or gpu_buffer.array is not np_struct: - # 0-sized structures fail to upload use a dummy numpy array isntead + # 0-sized structures fail to upload use a dummy numpy array instead gpu_buffer = arg_handler(np_struct if np_struct.nbytes > 0 else np.zeros(2)) self._gpu_buffers[struct_name] = gpu_buffer @@ -276,54 +264,41 @@ def _cuda_data_struct(self): def _cuda_conditions(self): return self.__get_cuda_arg("_conditions", jit_engine.pycuda.driver.InOut) - @property - def _cuda_out(self): - gpu_buffer = self._gpu_buffers["_out"] - if gpu_buffer is None: - gpu_buffer = 
jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(self._ct_vo)) - self._gpu_buffers["_out"] = gpu_buffer - - return gpu_buffer - def cuda_execute(self, variable): - # Create input argument - new_var = np.asfarray(variable, dtype=self._vi_dty) + # Create input argument, PyCUDA doesn't care about shape + new_var = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) data_in = jit_engine.pycuda.driver.In(new_var) + extra_dims = (len(self._execution_contexts),) if len(self._execution_contexts) > 1 else () + data_out = self._bin_func.np_buffer_for_arg(3, extra_dimensions=extra_dims) + self._bin_func.cuda_call(self._cuda_param_struct, self._cuda_state_struct, data_in, - self._cuda_out, + jit_engine.pycuda.driver.Out(data_out), threads=len(self._execution_contexts)) - return _convert_ctype_to_python(self._ct_vo) + return self._get_indexable(data_out) class FuncExecution(CUDAExecution): def __init__(self, component, execution_ids=[None], *, tags=frozenset()): super().__init__() - self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags) + + self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags, numpy_args=(0, 1, 2, 3)) self._execution_contexts = [ Context(execution_id=eid) for eid in execution_ids ] self._component = component - _, _, vi_ty, vo_ty = self._bin_func.byref_arg_types if len(execution_ids) > 1: self._bin_multirun = self._bin_func.get_multi_run() self._ct_len = ctypes.c_int(len(execution_ids)) - vo_ty = vo_ty * len(execution_ids) - vi_ty = vi_ty * len(execution_ids) - self._ct_vo = vo_ty() - self._vi_dty = _element_dtype(vi_ty) - if "stat" in self._debug_env: - print("Input struct size:", _pretty_size(ctypes.sizeof(vi_ty)), - "for", self._component.name) - print("Output struct size:", _pretty_size(ctypes.sizeof(vo_ty)), - "for", self._component.name) + vo_ty = self._bin_func.byref_arg_types[3] * len(execution_ids) + self._ct_vo = vo_ty() @property def _obj(self): @@ -338,36 +313,29 @@ def _state_struct(self): return self._get_compilation_param('_state', '_get_state_initializer', 1) def execute(self, variable): - # Make sure function inputs are 2d. - # Mechanism inputs are already 3d so the first part is nop. - new_variable = np.asfarray(np.atleast_2d(variable), - dtype=self._vi_dty) + new_variable = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) - ct_vi = np.ctypeslib.as_ctypes(new_variable) if len(self._execution_contexts) > 1: - # wrap_call casts the arguments so we only need contiguous data - # layout + # wrap_call casts the arguments so we only need contiguous data layout + ct_vi = np.ctypeslib.as_ctypes(new_variable) + self._bin_multirun.wrap_call(self._param_struct[0], self._state_struct[0], ct_vi, self._ct_vo, self._ct_len) + return _convert_ctype_to_python(self._ct_vo) else: - self._bin_func(self._param_struct[0], self._state_struct[0], ct_vi, self._ct_vo) + data_out = self._bin_func.np_buffer_for_arg(3) + data_in = new_variable.reshape(self._bin_func.np_params[2].shape) - return _convert_ctype_to_python(self._ct_vo) + self._bin_func(self._param_struct[1], self._state_struct[1], data_in, data_out) + return self._get_indexable(data_out) -class MechExecution(FuncExecution): - def execute(self, variable): - # Convert to 3d. 
We always assume that: - # a) the input is vector of input ports - # b) input ports take vector of projection outputs - # c) projection output is a vector (even 1 element vector) - new_var = np.atleast_3d(variable) - new_var.shape = (len(self._component.input_ports), 1, -1) - return super().execute(new_var) +class MechExecution(FuncExecution): + pass class CompExecution(CUDAExecution): @@ -385,10 +353,11 @@ def __init__(self, composition, execution_ids=[None], *, additional_tags=frozens self.__bin_func = None self.__bin_run_func = None self.__bin_run_multi_func = None - self.__frozen_vals = None + self.__frozen_values = None self.__tags = frozenset(additional_tags) - self.__conds = None + # Scheduling conditions, only used by "execute" + self.__conditions = None if len(execution_ids) > 1: self._ct_len = ctypes.c_int(len(execution_ids)) @@ -440,29 +409,37 @@ def _bin_func_multirun(self): def _set_bin_node(self, node): assert node in self._composition._all_nodes - wrapper = builder_context.LLVMBuilderContext.get_current().get_node_wrapper(self._composition, node) - self.__bin_func = pnlvm.LLVMBinaryFunction.from_obj( - wrapper, tags=self.__tags.union({"node_wrapper"})) + node_assembly = builder_context.LLVMBuilderContext.get_current().get_node_assembly(self._composition, node) + self.__bin_func = pnlvm.LLVMBinaryFunction.from_obj(node_assembly, + tags=self.__tags.union({"node_assembly"}), + numpy_args=(0, 1, 2, 3, 4)) @property def _conditions(self): - if self.__conds is None: + if self.__conditions is None: gen = helpers.ConditionGenerator(None, self._composition) + if len(self._execution_contexts) > 1: - cond_ctype = self._bin_func_multirun.byref_arg_types[4] * len(self._execution_contexts) - cond_initializer = (gen.get_condition_initializer() for _ in self._execution_contexts) + conditions_ctype = self._bin_func_multirun.byref_arg_types[4] * len(self._execution_contexts) + conditions_initializer = (gen.get_condition_initializer() for _ in self._execution_contexts) else: - cond_ctype = self._bin_func.byref_arg_types[4] - cond_initializer = gen.get_condition_initializer() + conditions_ctype = self._bin_func.byref_arg_types[4] + conditions_initializer = gen.get_condition_initializer() + + ct_conditions = conditions_ctype(*conditions_initializer) + np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_params[4], count=len(self._execution_contexts)) + + if len(self._execution_contexts) == 1: + np_conditions.shape = () + + self.__conditions = (ct_conditions, np_conditions) - c_conds = cond_ctype(*cond_initializer) - self.__conds = (c_conds, np.ctypeslib.as_array(c_conds)) if "stat" in self._debug_env: print("Instantiated condition struct ( size:" , - _pretty_size(ctypes.sizeof(cond_ctype)), ")", + _pretty_size(ctypes.sizeof(conditions_ctype)), ")", "for", self._composition.name) - return self.__conds + return self.__conditions @property def _param_struct(self): @@ -482,8 +459,8 @@ def _data_struct(self): def _data_struct(self, data_struct): self._data = data_struct - def _extract_node_struct(self, node, data): - # context structure consists of a list of node contexts, + def _extract_node_struct_from_ctype(self, node, data): + # state structure consists of a list of node states, # followed by a list of projection contexts; get the first one # parameter structure consists of a list of node parameters, # followed by a list of projection parameters; get the first one @@ -499,60 +476,90 @@ def _extract_node_struct(self, node, data): return _convert_ctype_to_python(res_struct) + def 
_extract_node_struct_from_numpy(self, node, data): + # state structure consists of a list of node states, + # followed by a list of projection contexts; get the first one + # parameter structure consists of a list of node parameters, + # followed by a list of projection parameters; get the first one + # output structure consists of a list of node outputs, + # followed by a list of nested data structures; get the first one + all_nodes = data[data.dtype.names[0]] + + # Get the index into the array of all nodes + index = self._composition._get_node_index(node) + node_struct = all_nodes[all_nodes.dtype.names[index]] + + # Return copies of the extracted functions to avoid corrupting the + # returned results in next execution + return node_struct.copy().tolist() if node_struct.shape == () else node_struct.copy() + def extract_node_struct(self, node, struct): if len(self._execution_contexts) > 1: - return [self._extract_node_struct(node, struct[i]) for i, _ in enumerate(self._execution_contexts)] + return [self._extract_node_struct_from_ctype(node, struct[0][i]) for i, _ in enumerate(self._execution_contexts)] else: - return self._extract_node_struct(node, struct) + return self._extract_node_struct_from_numpy(node, struct[1]) def extract_frozen_node_output(self, node): - return self.extract_node_struct(node, self.__frozen_vals) + return self.extract_node_struct(node, self.__frozen_values) def extract_node_output(self, node): - return self.extract_node_struct(node, self._data_struct[0]) + return self.extract_node_struct(node, self._data_struct) def extract_node_state(self, node): - return self.extract_node_struct(node, self._state_struct[0]) + return self.extract_node_struct(node, self._state_struct) def extract_node_params(self, node): - return self.extract_node_struct(node, self._param_struct[0]) + return self.extract_node_struct(node, self._param_struct) def insert_node_output(self, node, data): - my_field_name = self._data_struct[0]._fields_[0][0] - my_res_struct = getattr(self._data_struct[0], my_field_name) + # output structure consists of a list of node outputs, + # followed by a list of nested data structures; get the first one + all_nodes = self._data_struct[1][self._data_struct[1].dtype.names[0]] + + # Get the index into the array of all nodes index = self._composition._get_node_index(node) - node_field_name = my_res_struct._fields_[index][0] - setattr(my_res_struct, node_field_name, _tupleize(data)) + value = all_nodes[all_nodes.dtype.names[index]] + np.copyto(value, np.asarray(data, dtype=value.dtype)) def _get_input_struct(self, inputs): # Either node or composition execute. - # All execute functions expect inputs to be 3rd param. - c_input_type = self._bin_func.byref_arg_types[2] # Read provided input data and parse into an array (generator) if len(self._execution_contexts) > 1: assert len(self._execution_contexts) == len(inputs) - c_input_type = c_input_type * len(self._execution_contexts) + + # All execute functions expect inputs to be 3rd param. 
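(The execution-side changes above replace `np.ctypeslib.as_array` with `np.frombuffer`, so compiled parameter, state, and data structures are viewed as structured numpy arrays that share memory with the underlying ctypes buffers, and node results are read and written by field name, as in `_extract_node_struct_from_numpy` and `insert_node_output`. A small self-contained sketch of that pattern with a hypothetical two-node layout, not taken from the patch:)

```python
import ctypes
import numpy as np

class Node(ctypes.Structure):
    _fields_ = [("value", ctypes.c_double * 2)]

class Data(ctypes.Structure):
    _fields_ = [("nodes", Node * 2)]

data = Data()
data.nodes[0].value[:] = [1.0, 2.0]
data.nodes[1].value[:] = [3.0, 4.0]

# Structured dtypes matching the ctypes layout
node_dt = np.dtype([("value", np.float64, (2,))], align=True)
data_dt = np.dtype([("nodes", node_dt, (2,))], align=True)

# Shared-memory view of the ctypes buffer; no copy is made
view = np.frombuffer(data, dtype=data_dt, count=1)
view.shape = ()                                  # single execution context, as in the patch

all_nodes = view[view.dtype.names[0]]            # the "nodes" field
assert all_nodes["value"][1].tolist() == [3.0, 4.0]

# Writes through the view are visible in the ctypes structure
all_nodes["value"][0][:] = [9.0, 9.0]
assert data.nodes[0].value[0] == 9.0
```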
+ ct_input_type = self._bin_func.byref_arg_types[2] * len(self._execution_contexts) + input_data = (([x] for x in self._composition._build_variable_for_input_CIM(inp)) for inp in inputs) + + ct_input = ct_input_type(*_tupleize(input_data)) + np_input = np.ctypeslib.as_array(ct_input) else: - input_data = ([x] for x in self._composition._build_variable_for_input_CIM(inputs)) + ct_input = None + data = self._composition._build_variable_for_input_CIM(inputs) + + np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base) + np_input = np_input.reshape(self._bin_func.np_params[2].shape) if "stat" in self._debug_env: - print("Input struct size:", _pretty_size(ctypes.sizeof(c_input_type)), - "for", self._composition.name) - c_input = c_input_type(*_tupleize(input_data)) - return c_input, np.ctypeslib.as_array(c_input) + print("Input struct size:", _pretty_size(np_input.nbytes), "for", self._composition.name) + + return ct_input, np_input def freeze_values(self): - self.__frozen_vals = copy.deepcopy(self._data_struct[0]) + np_copy = self._data_struct[1].copy() - def execute_node(self, node, inputs=None, context=None): + self.__frozen_values = (None, np_copy) + + def execute_node(self, node, inputs=None): # We need to reconstruct the input dictionary here if it was not provided. # This happens during node execution of nested compositions. assert len(self._execution_contexts) == 1 + context = self._execution_contexts[0] + if inputs is None and node is self._composition.input_CIM: - if context is None: - context = self._execution_contexts[0] + port_inputs = {origin_port:[proj.parameters.value._get(context) for proj in p[0].path_afferents] for (origin_port, p) in self._composition.input_CIM_ports.items()} inputs = {} for p, v in port_inputs.items(): @@ -560,23 +567,33 @@ def execute_node(self, node, inputs=None, context=None): index = p.owner.input_ports.index(p) data[index] = v[0] + assert inputs is not None or node is not self._composition.input_CIM # Set bin node to make sure self._*struct works as expected self._set_bin_node(node) - if inputs is not None: - inputs = self._get_input_struct(inputs)[0] - assert inputs is not None or node is not self._composition.input_CIM + # Numpy doesn't allow to pass NULL to the called function. + # Create and pass a dummy buffer filled with NaN instead. + if inputs is not None: + inputs = self._get_input_struct(inputs)[1] + else: + inputs = self._bin_func.np_buffer_for_arg(2) - # Freeze output values if this is the first time we need them - if node is not self._composition.input_CIM and self.__frozen_vals is None: - self.freeze_values() + # Nodes other than input_CIM/parameter_CIM take inputs from projections + # and need frozen values available + if node is not self._composition.input_CIM and node is not self._composition.parameter_CIM: + assert self.__frozen_values is not None + data_in = self.__frozen_values[1] + else: + # The ndarray argument check doesn't allow None for null so just provide + # the same structure as outputs. + data_in = self._data_struct[1] - self._bin_func(self._state_struct[0], - self._param_struct[0], + self._bin_func(self._state_struct[1], + self._param_struct[1], inputs, - self.__frozen_vals, - self._data_struct[0]) + data_in, + self._data_struct[1]) if "comp_node_debug" in self._debug_env: print("RAN: {}. 
State: {}".format(node, self.extract_node_state(node))) @@ -589,7 +606,7 @@ def execute_node(self, node, inputs=None, context=None): def _bin_exec_func(self): if self.__bin_exec_func is None: self.__bin_exec_func = pnlvm.LLVMBinaryFunction.from_obj( - self._composition, tags=self.__tags) + self._composition, tags=self.__tags, numpy_args=(0, 1, 2, 3, 4)) return self.__bin_exec_func @@ -611,11 +628,11 @@ def execute(self, inputs): self._conditions[0], self._ct_len) else: - self._bin_exec_func(self._state_struct[0], - self._param_struct[0], - self._get_input_struct(inputs)[0], - self._data_struct[0], - self._conditions[0]) + self._bin_exec_func(self._state_struct[1], + self._param_struct[1], + self._get_input_struct(inputs)[1], + self._data_struct[1], + self._conditions[1]) def cuda_execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. @@ -664,7 +681,7 @@ def _get_generator_run_input_struct(self, inputs, runs): def _bin_run_func(self): if self.__bin_run_func is None: self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj( - self._composition, tags=self.__tags.union({"run"})) + self._composition, tags=self.__tags.union({"run"}), numpy_args=(0, 1, 2)) return self.__bin_run_func @@ -712,9 +729,9 @@ def run(self, inputs, runs=0, num_input_sets=0): # This is only needed for non-generator inputs that are wrapped in an extra context dimension inputs = ctypes.cast(inputs, self._bin_run_func.c_func.argtypes[3]) - self._bin_run_func(self._state_struct[0], - self._param_struct[0], - self._data_struct[0], + self._bin_run_func(self._state_struct[1], + self._param_struct[1], + self._data_struct[1], inputs, outputs, runs_count, @@ -766,7 +783,7 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective" tags = {"evaluate", "alloc_range", eval_type} - bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags)) + bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), numpy_args=(0, 1, 6)) self.__bin_func = bin_func # There are 8 arguments to evaluate_alloc_range: @@ -776,9 +793,9 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results # Directly initialized structures assert ocm.agent_rep is self._composition - comp_params = self._get_compilation_param('_eval_param', '_get_param_initializer', 0) - comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1) - comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6) + comp_params = self._get_compilation_param('_eval_param', '_get_param_initializer', 0)[1] + comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1)[1] + comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6)[1] # Construct input variable, the 5th parameter of the evaluate function ct_inputs = self._get_run_input_struct(inputs, num_input_sets, 5) @@ -799,7 +816,6 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results "( evaluations:", num_evaluations, "element size:", ctypes.sizeof(out_el_ty), ")", "for", self._obj.name) - # return variable as numpy array. 
pycuda can use it directly return comp_params, comp_state, comp_data, ct_inputs, out_ty, ct_num_inputs def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool=False): @@ -808,11 +824,11 @@ def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:boo ct_results = out_ty() - cuda_args = (jit_engine.pycuda.driver.In(comp_params[1]), - jit_engine.pycuda.driver.InOut(comp_state[1]), + cuda_args = (jit_engine.pycuda.driver.In(comp_params), + jit_engine.pycuda.driver.InOut(comp_state), jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_results)), # results jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # inputs - jit_engine.pycuda.driver.InOut(comp_data[1]), # composition data + jit_engine.pycuda.driver.InOut(comp_data), # composition data jit_engine.pycuda.driver.In(np.int32(num_input_sets)), # number of inputs ) @@ -833,19 +849,19 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b # Create input and result typed casts once, they are the same # for every submitted job. - input_param = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5]) - results_param = ctypes.cast(ct_results, self.__bin_func.c_func.argtypes[4]) + input_arg = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5]) + results_arg = ctypes.cast(ct_results, self.__bin_func.c_func.argtypes[4]) # There are 7 arguments to evaluate_alloc_range: # comp_param, comp_state, from, to, results, input, comp_data results = [ex.submit(self.__bin_func, - comp_params[0], - comp_state[0], + comp_params, + comp_state, int(i * evals_per_job), min((i + 1) * evals_per_job, num_evaluations), - results_param, - input_param, - comp_data[0], + results_arg, + input_arg, + comp_data, ct_num_inputs) for i in range(jobs)] diff --git a/psyneulink/core/llvm/helpers.py b/psyneulink/core/llvm/helpers.py index a7464fd7664..2eae0e69974 100644 --- a/psyneulink/core/llvm/helpers.py +++ b/psyneulink/core/llvm/helpers.py @@ -706,7 +706,7 @@ def generate_sched_condition(self, builder, condition, cond_ptr, node, # The first argument is the target node assert len(condition.args) == 1 target = is_finished_callbacks[condition.args[0]] - is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_wrapper"})) + is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_assembly"})) return builder.call(is_finished_f, target[1]) elif isinstance(condition, WhenFinishedAny): @@ -715,7 +715,7 @@ def generate_sched_condition(self, builder, condition, cond_ptr, node, run_cond = self.ctx.bool_ty(0) for node in condition.args: target = is_finished_callbacks[node] - is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_wrapper"})) + is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_assembly"})) node_is_finished = builder.call(is_finished_f, target[1]) run_cond = builder.or_(run_cond, node_is_finished) @@ -728,7 +728,7 @@ def generate_sched_condition(self, builder, condition, cond_ptr, node, run_cond = self.ctx.bool_ty(1) for node in condition.args: target = is_finished_callbacks[node] - is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_wrapper"})) + is_finished_f = self.ctx.import_llvm_function(target[0], tags=frozenset({"is_finished", "node_assembly"})) node_is_finished = builder.call(is_finished_f, target[1]) run_cond = builder.and_(run_cond, node_is_finished) diff --git 
a/psyneulink/library/components/mechanisms/processing/transfer/lcamechanism.py b/psyneulink/library/components/mechanisms/processing/transfer/lcamechanism.py index 4b197406217..48e5292a9d7 100644 --- a/psyneulink/library/components/mechanisms/processing/transfer/lcamechanism.py +++ b/psyneulink/library/components/mechanisms/processing/transfer/lcamechanism.py @@ -392,7 +392,7 @@ class Parameters(RecurrentTransferMechanism.Parameters): matrix = Parameter( INVERSE_HOLLOW_MATRIX, - modulable=True, + modulable=False, getter=_recurrent_transfer_mechanism_matrix_getter, setter=_recurrent_transfer_mechanism_matrix_setter ) @@ -402,9 +402,9 @@ class Parameters(RecurrentTransferMechanism.Parameters): function_parameter_name='rate', aliases='leak' ) - auto = Parameter(0.0, modulable=True, aliases='self_excitation') - hetero = Parameter(-1.0, modulable=True) - competition = Parameter(1.0, modulable=True) + auto = Parameter(0.0, modulable=False, aliases='self_excitation') + hetero = Parameter(-1.0, modulable=False) + competition = Parameter(1.0, modulable=False) time_step_size = FunctionParameter(0.1, function_name='integrator_function') integrator_mode = Parameter(True, setter=_integrator_mode_setter, valid_types=bool) diff --git a/requirements.txt b/requirements.txt index dc485bfd0e2..8966c89ce8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,6 @@ pillow<10.5.0 pint<0.22.0 protobuf<3.20.4 rich>=10.1, <10.13 -scipy<1.12 +scipy>=1.7.3, <1.15 toposort<1.11 torch>=1.10.0, <2.4.0; (platform_machine == 'AMD64' or platform_machine == 'x86_64' or platform_machine == 'arm64' or platform_machine == 'aarch64') and platform_python_implementation == 'CPython' and implementation_name == 'cpython' diff --git a/setup.cfg b/setup.cfg index ffc15d5cfb9..911094866c0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -71,6 +71,8 @@ filterwarnings = error:Creating an ndarray from ragged nested sequences \(which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes\) is deprecated.*:numpy.VisibleDeprecationWarning error:Invalid escape sequence error:the matrix subclass is not the recommended way to represent matrices or deal with linear algebra + error:Passing (type, 1) or '1type' as a synonym of type is deprecated + error:A builtin ctypes object gave a PEP3118:RuntimeWarning [pycodestyle] # for code explanation see https://pep8.readthedocs.io/en/latest/intro.html#error-codes diff --git a/tests/composition/test_control.py b/tests/composition/test_control.py index d390a7274f9..04a512b15ee 100644 --- a/tests/composition/test_control.py +++ b/tests/composition/test_control.py @@ -2711,7 +2711,8 @@ def test_modulation_of_random_state(self, comp_mode, num_generators): @pytest.mark.composition @pytest.mark.control class TestModelBasedOptimizationControlMechanisms_Execution: - def test_ocm_default_function(self): + @pytest.mark.parametrize("mode, ocm_mode", pytest.helpers.get_comp_and_ocm_execution_modes()) + def test_ocm_default_function(self, ocm_mode, mode): a = pnl.ProcessingMechanism() comp = pnl.Composition( controller_mode=pnl.BEFORE, @@ -2729,26 +2730,19 @@ def test_ocm_default_function(self): ), ) ) + comp.controller.comp_execution_mode = ocm_mode + assert type(comp.controller.function) == pnl.GridSearch - assert comp.run([1]) == [10] + + res = comp.run([1], execution_mode=mode) + np.testing.assert_array_equal(res, [[10]]) @pytest.mark.parametrize("nested", [True, False]) - @pytest.mark.parametrize("format", ["list", "tuple", "SampleIterator", "SampleIteratorArray", "SampleSpec", 
"ndArray"]) + @pytest.mark.parametrize("search_space", + [[1, 10], (1, 10), SampleIterator((1, 10)), SampleIterator([1, 10]), SampleSpec(1, 10, 9), np.array((1, 10))], + ids=["list", "tuple", "SampleIterator", "SampleIteratorArray", "SampleSpec", "ndArray"]) @pytest.mark.parametrize("mode, ocm_mode", pytest.helpers.get_comp_and_ocm_execution_modes()) - def test_ocm_searchspace_format_equivalence(self, format, nested, mode, ocm_mode): - - if format == "list": - search_space = [1, 10] - elif format == "tuple": - search_space = (1, 10) - elif format == "SampleIterator": - search_space = SampleIterator((1, 10)) - elif format == "SampleIteratorArray": - search_space = SampleIterator([1, 10]) - elif format == "SampleSpec": - search_space = SampleSpec(1, 10, 9) - elif format == "ndArray": - search_space = np.array((1, 10)) + def test_ocm_searchspace_format_equivalence(self, search_space, nested, mode, ocm_mode): if nested: search_space = [search_space] @@ -2772,7 +2766,9 @@ def test_ocm_searchspace_format_equivalence(self, format, nested, mode, ocm_mode comp.controller.comp_execution_mode = ocm_mode assert type(comp.controller.function) == pnl.GridSearch - assert comp.run([1], execution_mode=mode) == [[10]] + + res = comp.run([1], execution_mode=mode) + np.testing.assert_array_equal(res, [[10]]) def test_evc(self): # Mechanisms diff --git a/tests/composition/test_parameterestimationcomposition.py b/tests/composition/test_parameterestimationcomposition.py index bf3a8c3138b..12c39a64a3a 100644 --- a/tests/composition/test_parameterestimationcomposition.py +++ b/tests/composition/test_parameterestimationcomposition.py @@ -1,7 +1,10 @@ import numpy as np +import optuna import pandas as pd import pytest -import optuna +import scipy + +from packaging import version as pversion import psyneulink as pnl @@ -125,17 +128,32 @@ def test_pec_run_input_formats(inputs_dict, error_msg): pec.run(inputs=inputs_dict) +# SciPy changed their implementation of differential evolution and the way it selects +# samples to evaluate in 1.12 [0,1], and then again in 1.14 [2,3], leading to slightly +# different results +# +# [0] https://docs.scipy.org/doc/scipy/release/1.12.0-notes.html#scipy-optimize-improvements +# [1] https://github.com/scipy/scipy/pull/18496 +# [2] https://docs.scipy.org/doc/scipy/release/1.14.0-notes.html#scipy-optimize-improvements +# [3] https://github.com/scipy/scipy/pull/20677 +if pversion.parse(scipy.version.version) >= pversion.parse('1.14.0'): + expected_differential_evolution = [0.010113000942356953] +elif pversion.parse(scipy.version.version) >= pversion.parse('1.12.0'): + expected_differential_evolution = [0.010074123395259815] +else: + expected_differential_evolution = [0.010363518438648106] + @pytest.mark.composition @pytest.mark.parametrize( - "opt_method, result", + "opt_method, expected_result", [ - ("differential_evolution", [0.010363518438648106]), + ("differential_evolution", expected_differential_evolution), (optuna.samplers.RandomSampler(seed=0), [0.01]), (optuna.samplers.CmaEsSampler(seed=0), [0.01]), ], - ids=["differential_evolultion", "optuna_random_sampler", "optuna_cmaes_sampler"], + ids=["differential_evolution", "optuna_random_sampler", "optuna_cmaes_sampler"], ) -def test_parameter_optimization_ddm(func_mode, opt_method, result): +def test_parameter_optimization_ddm(func_mode, opt_method, expected_result): """Test parameter optimization of a DDM in integrator mode""" if func_mode == "Python": @@ -210,11 +228,9 @@ def reward_rate(sim_data): trial_inputs[0] = 
np.abs(trial_inputs[0]) trial_inputs[-1] = np.abs(trial_inputs[-1]) - inputs_dict = {decision: trial_inputs} - - ret = pec.run(inputs={comp: trial_inputs}) + pec.run(inputs={comp: trial_inputs}) - np.testing.assert_allclose(pec.optimized_parameter_values, result) + np.testing.assert_allclose(pec.optimized_parameter_values, expected_result) # func_mode is a hacky wa to get properly marked; Python, LLVM, and CUDA diff --git a/tests/llvm/test_builtins_intrinsics.py b/tests/llvm/test_builtins_intrinsics.py index fae99faa520..22cc3d2df8d 100644 --- a/tests/llvm/test_builtins_intrinsics.py +++ b/tests/llvm/test_builtins_intrinsics.py @@ -32,8 +32,10 @@ def test_builtin_op(benchmark, op, args, builtin, result, func_mode): if func_mode == 'Python': f = op + elif func_mode == 'LLVM': f = pnlvm.LLVMBinaryFunction.get(builtin) + elif func_mode == 'PTX': wrap_name = builtin + "_test_wrapper" with pnlvm.LLVMBuilderContext.get_current() as ctx: @@ -47,12 +49,18 @@ def test_builtin_op(benchmark, op, args, builtin, result, func_mode): builder.ret_void() bin_f = pnlvm.LLVMBinaryFunction.get(wrap_name) - dty = np.dtype(bin_f.byref_arg_types[0]) + + # The result argument is a pointer, use it to derive + # the right argument type + dty = bin_f.np_params[1].base + ptx_res = np.empty_like(result, dtype=dty) ptx_res_arg = pnlvm.jit_engine.pycuda.driver.Out(ptx_res) + def f(*a): bin_f.cuda_call(*(dty.type(p) for p in a), ptx_res_arg) return ptx_res + res = benchmark(f, *args) if pytest.helpers.llvm_current_fp_precision() == 'fp32': diff --git a/tests/llvm/test_builtins_matrix.py b/tests/llvm/test_builtins_matrix.py index e338f80bed3..1cad00e1565 100644 --- a/tests/llvm/test_builtins_matrix.py +++ b/tests/llvm/test_builtins_matrix.py @@ -39,9 +39,6 @@ def _get_const_dim_func(builtin, *dims): builtin = ctx.import_llvm_function(builtin) pointer_arg_types = [a for a in builtin.type.pointee.args if pnlvm.helpers.is_pointer(a)] - func_ty = ir.FunctionType(ir.VoidType(), pointer_arg_types) - - # Create square vector matrix multiply function = ir.Function(ctx.module, builtin.type.pointee, name=custom_name) const_dims = (ctx.int32_ty(d) for d in dims) @@ -65,6 +62,14 @@ def _get_const_dim_func(builtin, *dims): ], ids=["ADD", "SUB", "MUL", "ADDS", "MULS", "DOT", "TRANS DOT"]) @pytest.mark.parametrize("dims", [(DIM_X, DIM_Y), (0, 0)], ids=["VAR-DIM", "CONST-DIM"]) def test_matrix_op(benchmark, op, x, y, builtin, result, func_mode, dims): + + def _numpy_args(bin_f): + np_x = x.astype(bin_f.np_params[0]) + np_y = bin_f.np_params[1].type(y) if np.isscalar(y) else y.astype(bin_f.np_params[1]) + np_res = np.empty_like(result, dtype=bin_f.np_params[-1]) + + return np_x, np_y, np_res + if func_mode == 'Python': def ex(): return op(x, y) @@ -76,13 +81,7 @@ def ex(): func_name = builtin bin_f = pnlvm.LLVMBinaryFunction.get(func_name) - dty = np.dtype(bin_f.byref_arg_types[0]) - assert dty == np.dtype(bin_f.byref_arg_types[1]) - assert dty == np.dtype(bin_f.byref_arg_types[4]) - - lx = x.astype(dty) - ly = dty.type(y) if np.isscalar(y) else y.astype(dty) - lres = np.empty_like(result, dtype=dty) + lx, ly, lres = _numpy_args(bin_f) ct_x = lx.ctypes.data_as(bin_f.c_func.argtypes[0]) ct_y = ly if np.isscalar(ly) else ly.ctypes.data_as(bin_f.c_func.argtypes[1]) @@ -99,17 +98,12 @@ def ex(): func_name = builtin bin_f = pnlvm.LLVMBinaryFunction.get(func_name) - dty = np.dtype(bin_f.byref_arg_types[0]) - assert dty == np.dtype(bin_f.byref_arg_types[1]) - assert dty == np.dtype(bin_f.byref_arg_types[4]) - - lx = x.astype(dty) - ly = 
dty.type(y) if np.isscalar(y) else y.astype(dty) - lres = np.empty_like(result, dtype=dty) + lx, ly, lres = _numpy_args(bin_f) cuda_x = pnlvm.jit_engine.pycuda.driver.In(lx) cuda_y = ly if np.isscalar(ly) else pnlvm.jit_engine.pycuda.driver.In(ly) cuda_res = pnlvm.jit_engine.pycuda.driver.Out(lres) + def ex(): bin_f.cuda_call(cuda_x, cuda_y, np.int32(dims[0]), np.int32(dims[1]), cuda_res) return lres diff --git a/tests/llvm/test_builtins_mt_random.py b/tests/llvm/test_builtins_mt_random.py index 4d25e53c8cd..2ff7cff0ea2 100644 --- a/tests/llvm/test_builtins_mt_random.py +++ b/tests/llvm/test_builtins_mt_random.py @@ -15,35 +15,46 @@ def test_random_int(benchmark, mode): res = [] if mode == 'Python': state = random.Random(SEED) + def f(): return state.randrange(0xffffffff) + elif mode == 'numpy': # Numpy promotes elements to int64 state = np.random.RandomState([SEED]) + def f(): return state.randint(0xffffffff, dtype=np.int64) + elif mode == 'LLVM': - init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init') - state = init_fun.byref_arg_types[0]() + init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,)) + state = init_fun.np_buffer_for_arg(0) + init_fun(state, SEED) - gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32') - out = ctypes.c_ulonglong() + gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32', numpy_args=(0, 1)) + def f(): + out = gen_fun.np_buffer_for_arg(1) gen_fun(state, out) - return out.value + return out + elif mode == 'PTX': init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init') - state_size = ctypes.sizeof(init_fun.byref_arg_types[0]) + + state_size = init_fun.np_buffer_for_arg(0).nbytes gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size) + init_fun.cuda_call(gpu_state, np.int32(SEED)) gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32') - out = np.asarray([0], dtype=np.uint64) + out = gen_fun.np_buffer_for_arg(1) gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out) + def f(): gen_fun.cuda_call(gpu_state, gpu_out) - return out[0] + return out.copy() + else: assert False, "Unknown mode: {}".format(mode) @@ -61,35 +72,45 @@ def test_random_float(benchmark, mode): if mode == 'Python': # Python treats every seed as array state = random.Random(SEED) + def f(): return state.random() + elif mode == 'numpy': # numpy promotes elements to int64 state = np.random.RandomState([SEED]) + def f(): return state.random_sample() + elif mode == 'LLVM': - init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init') - state = init_fun.byref_arg_types[0]() + init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,)) + state = init_fun.np_buffer_for_arg(0) init_fun(state, SEED) - gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double') - out = gen_fun.byref_arg_types[1]() + gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double', numpy_args=(0, 1)) + def f(): + out = gen_fun.np_buffer_for_arg(1) gen_fun(state, out) - return out.value + return out + elif mode == 'PTX': init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init') - state_size = ctypes.sizeof(init_fun.byref_arg_types[0]) + + state_size = init_fun.np_buffer_for_arg(0).nbytes gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size) + init_fun.cuda_call(gpu_state, np.int32(SEED)) gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double') - out = np.asfarray([0.0], dtype=np.dtype(gen_fun.byref_arg_types[1])) + out = 
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -107,30 +128,38 @@ def f():
     if mode == 'numpy':
         # numpy promotes elements to int64
         state = np.random.RandomState([SEED])
+
         def f():
             return state.normal()
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal')
-        out = gen_fun.byref_arg_types[1]()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
+
         init_fun.cuda_call(gpu_state, np.int32(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal')
-        out = np.asfarray([0.0], dtype=np.dtype(gen_fun.byref_arg_types[1]))
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -157,35 +186,44 @@ def test_random_binomial(benchmark, mode, n, p, exp):
     if mode == 'numpy':
         # numpy promotes elements to int64
         state = np.random.RandomState([SEED])
+
         def f():
             return state.binomial(n, p)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial')
-        c_n = gen_fun.byref_arg_types[1](n)
-        c_p = gen_fun.byref_arg_types[2](p)
-        c_out = gen_fun.byref_arg_types[-1]()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial', numpy_args=(0, 1, 2, 3))
+        n = np.asarray(n, dtype=gen_fun.np_params[1])
+        p = np.asarray(p, dtype=gen_fun.np_params[2])
+
         def f():
-            gen_fun(state, c_n, c_p, c_out)
-            return c_out.value
+            out = gen_fun.np_buffer_for_arg(1)
+            gen_fun(state, n, p, out)
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
+
         init_fun.cuda_call(gpu_state, np.int32(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial')
-        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.array([n], dtype=np.dtype(gen_fun.byref_arg_types[1])))
-        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.array([p], dtype=np.dtype(gen_fun.byref_arg_types[2])))
-        out = np.array([0.0], dtype=np.dtype(gen_fun.byref_arg_types[3]))
+
+        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_params[1]))
+        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_params[2]))
+
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
         def f():
             gen_fun.cuda_call(gpu_state, gpu_n, gpu_p, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
diff --git a/tests/llvm/test_builtins_philox_random.py b/tests/llvm/test_builtins_philox_random.py
index 40fc1abc09a..0c6e289a700 100644
--- a/tests/llvm/test_builtins_philox_random.py
+++ b/tests/llvm/test_builtins_philox_random.py
@@ -26,27 +26,32 @@ def f():
             return prng.integers(0xffffffffffffffff, dtype=np.uint64, endpoint=True)
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, seed)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64')
-        out = ctypes.c_ulonglong()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return np.uint64(out.value)
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
+
         init_fun.cuda_call(gpu_state, np.int64(seed))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64')
-        out = np.asarray([0], dtype=np.uint64)
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -64,33 +69,38 @@ def test_random_int32(benchmark, mode):
     res = []
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             # Get uint range [0, MAX] to avoid any intermediate caching of random bits
             return prng.integers(0xffffffff, dtype=np.uint32, endpoint=True)
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32')
-        out = ctypes.c_uint()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32')
-        out = np.asarray([0], dtype=np.uint32)
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -109,30 +119,36 @@ def test_random_double(benchmark, mode):
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             return prng.random(dtype=np.float64)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double')
-        out = ctypes.c_double()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double')
-        out = np.asfarray([0.0], dtype=np.float64)
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -150,30 +166,36 @@ def test_random_float(benchmark, mode):
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             return prng.random(dtype=np.float32)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float')
-        out = ctypes.c_float()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float')
-        out = np.asfarray([0.0], dtype=np.float32)
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -197,30 +219,36 @@ def test_random_normal(benchmark, mode, fp_type):
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             return prng.standard_normal(dtype=dtype)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state = init_fun.byref_arg_types[0]()
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal')
-        out = gen_fun.byref_arg_types[1]()
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal', numpy_args=(0, 1))
+
         def f():
+            out = gen_fun.np_buffer_for_arg(1)
             gen_fun(state, out)
-            return out.value
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal')
-        out = np.array([0.0], dtype=np.dtype(gen_fun.byref_arg_types[1]))
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
+
         def f():
             gen_fun.cuda_call(gpu_state, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
@@ -228,36 +256,38 @@ def f():
     if fp_type is pnlvm.ir.DoubleType():
         np.testing.assert_allclose(res[0:2], [-0.2059740286292238, -0.12884495093462758])
         # 208 doesn't take the fast path but wraps around the main loop
-        np.testing.assert_allclose(res[207:211], [-0.768690647997579, 0.4301874289485477,
-                                                  -0.7803640491708955, -1.146089287628737])
+        np.testing.assert_allclose(res[207:211],
+                                   [-0.768690647997579, 0.4301874289485477, -0.7803640491708955, -1.146089287628737])
         # 450 doesn't take the fast path or wrap around the main loop,
         # but takes the special condition at the end of the loop
-        np.testing.assert_allclose(res[449:453], [-0.7713655663874537, -0.5638348710823825,
-                                                  -0.9415838853097869, 0.6212784278881248])
+        np.testing.assert_allclose(res[449:453],
+                                   [-0.7713655663874537, -0.5638348710823825, -0.9415838853097869, 0.6212784278881248])
         # 2013 takes the rare secondary loop and exits in the first iteration
         # taking the positive value
-        np.testing.assert_allclose(res[2011:2015], [0.4201922976982861, 2.7021541445373916,
-                                                    3.7809967764329375, 0.19919094793393655])
+        np.testing.assert_allclose(res[2011:2015],
+                                   [0.4201922976982861, 2.7021541445373916, 3.7809967764329375, 0.19919094793393655])
         # 5136 takes the rare secondary loop and exits in the first iteration
         # taking the negative value
-        np.testing.assert_allclose(res[5134:5138], [0.12317411414687844, -0.17846827974421134,
-                                                    -3.6579887696059714, 0.2501530374224693])
+        np.testing.assert_allclose(res[5134:5138],
+                                   [0.12317411414687844, -0.17846827974421134, -3.6579887696059714, 0.2501530374224693])
         # 190855 takes the rare secondary loop and needs more than one iteration
-        np.testing.assert_allclose(res[190853:190857], [-0.26418319904491194, 0.35889007879353746,
-                                                        -3.843811523424439, -1.5256469840469997])
+        np.testing.assert_allclose(res[190853:190857],
+                                   [-0.26418319904491194, 0.35889007879353746, -3.843811523424439, -1.5256469840469997])
+
     elif fp_type is pnlvm.ir.FloatType():
         # The indices are taken from above and don't have special meaning.
         np.testing.assert_allclose(res[0:2], [-0.24822916090488434, -0.02676701545715332])
-        np.testing.assert_allclose(res[207:211], [-0.33086925745010376, -1.024695873260498,
-                                                  -0.5162619352340698, -0.15033885836601257])
-        np.testing.assert_allclose(res[449:453], [-0.2223609834909439, 0.16769859194755554,
-                                                  -0.7806711196899414, 0.5867824554443359])
-        np.testing.assert_allclose(res[2011:2015], [0.1979091316461563, -0.23467595875263214,
-                                                    1.1458240747451782, -1.0285860300064087])
-        np.testing.assert_allclose(res[5134:5138], [-1.0523858070373535, -3.007537603378296,
-                                                    -0.4331461489200592, -0.8841480612754822])
-        np.testing.assert_allclose(res[190853:190857], [-0.8958197236061096, 0.10532315075397491,
-                                                        2.000257730484009, -1.129721999168396])
+        np.testing.assert_allclose(res[207:211],
+                                   [-0.33086925745010376, -1.024695873260498, -0.5162619352340698, -0.15033885836601257])
+        np.testing.assert_allclose(res[449:453],
+                                   [-0.2223609834909439, 0.16769859194755554, -0.7806711196899414, 0.5867824554443359])
+        np.testing.assert_allclose(res[2011:2015],
+                                   [0.1979091316461563, -0.23467595875263214, 1.1458240747451782, -1.0285860300064087])
+        np.testing.assert_allclose(res[5134:5138],
+                                   [-1.0523858070373535, -3.007537603378296, -0.4331461489200592, -0.8841480612754822])
+        np.testing.assert_allclose(res[190853:190857],
+                                   [-0.8958197236061096, 0.10532315075397491, 2.000257730484009, -1.129721999168396])
+
     assert not any(np.isnan(res)), list(np.isnan(res)).index(True)
     benchmark(f)
@@ -287,35 +317,40 @@ def test_random_binomial(benchmark, mode, fp_type, n, p, exp_64, exp_32):
     if mode == 'numpy':
         state = np.random.Philox([SEED])
         prng = np.random.Generator(state)
+
         def f():
             return prng.binomial(n, p)
+
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        c_state = init_fun.byref_arg_types[0]()
-        init_fun(c_state, SEED)
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        state = init_fun.np_buffer_for_arg(0)
+        init_fun(state, SEED)
+
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial', numpy_args=(0, 1, 2, 3))
+        n = np.asarray(n, dtype=gen_fun.np_params[1])
+        p = np.asarray(p, dtype=gen_fun.np_params[2])
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial')
-        c_n = gen_fun.byref_arg_types[1](n)
-        c_p = gen_fun.byref_arg_types[2](p)
-        c_out = gen_fun.byref_arg_types[-1]()
         def f():
-            gen_fun(c_state, c_n, c_p, c_out)
-            return c_out.value
+            out = gen_fun.np_buffer_for_arg(1)
+            gen_fun(state, n, p, out)
+            return out
+
     elif mode == 'PTX':
         init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
-        state_size = ctypes.sizeof(init_fun.byref_arg_types[0])
+        state_size = init_fun.np_buffer_for_arg(0).nbytes
         gpu_state = pnlvm.jit_engine.pycuda.driver.mem_alloc(state_size)
         init_fun.cuda_call(gpu_state, np.int64(SEED))
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial')
-        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.array([n], dtype=np.dtype(gen_fun.byref_arg_types[1])))
-        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.array([p], dtype=np.dtype(gen_fun.byref_arg_types[2])))
-        out = np.array([0.0], dtype=np.dtype(gen_fun.byref_arg_types[3]))
+        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_params[1]))
+        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_params[2]))
+        out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
         def f():
             gen_fun.cuda_call(gpu_state, gpu_n, gpu_p, gpu_out)
-            return out[0]
+            return out.copy()
+
     else:
         assert False, "Unknown mode: {}".format(mode)
diff --git a/tests/llvm/test_builtins_vector.py b/tests/llvm/test_builtins_vector.py
index d840b7acba8..999a7e42696 100644
--- a/tests/llvm/test_builtins_vector.py
+++ b/tests/llvm/test_builtins_vector.py
@@ -6,6 +6,7 @@
 DIM_X=1500
+
 # These are just basic tests to check that vector indexing and operations
 # work correctly when compiled. The values don't matter much.
 # Might as well make them representable in fp32 for single precision testing.
@@ -13,13 +14,11 @@
 v = np.random.rand(DIM_X).astype(np.float32).astype(np.float64)
 scalar = np.random.rand()
-
 add_res = np.add(u, v)
 sub_res = np.subtract(u, v)
 mul_res = np.multiply(u, v)
 smul_res = np.multiply(u, scalar)
-
 @pytest.mark.benchmark(group="Hadamard")
 @pytest.mark.parametrize("op, v, builtin, result", [
     (np.add, v, "__pnl_builtin_vec_add", add_res),
@@ -28,18 +27,21 @@
     (np.multiply, scalar, "__pnl_builtin_vec_scalar_mult", smul_res),
     ], ids=["ADD", "SUB", "MUL", "SMUL"])
 def test_vector_op(benchmark, op, v, builtin, result, func_mode):
+
+    def _numpy_args(bin_f):
+        np_u = u.astype(bin_f.np_params[0])
+        np_v = bin_f.np_params[1].type(v) if np.isscalar(v) else v.astype(bin_f.np_params[1])
+        np_res = np.empty_like(np_u)
+
+        return np_u, np_v, np_res
+
     if func_mode == 'Python':
         def ex():
             return op(u, v)
+
     elif func_mode == 'LLVM':
         bin_f = pnlvm.LLVMBinaryFunction.get(builtin)
-        dty = np.dtype(bin_f.byref_arg_types[0])
-        assert dty == np.dtype(bin_f.byref_arg_types[1])
-        assert dty == np.dtype(bin_f.byref_arg_types[3])
-
-        lu = u.astype(dty)
-        lv = dty.type(v) if np.isscalar(v) else v.astype(dty)
-        lres = np.empty_like(lu)
+        lu, lv, lres = _numpy_args(bin_f)
         ct_u = lu.ctypes.data_as(bin_f.c_func.argtypes[0])
         ct_v = lv if np.isscalar(lv) else lv.ctypes.data_as(bin_f.c_func.argtypes[1])
@@ -51,17 +53,12 @@ def ex():
     elif func_mode == 'PTX':
         bin_f = pnlvm.LLVMBinaryFunction.get(builtin)
-        dty = np.dtype(bin_f.byref_arg_types[0])
-        assert dty == np.dtype(bin_f.byref_arg_types[1])
-        assert dty == np.dtype(bin_f.byref_arg_types[3])
-
-        lu = u.astype(dty)
-        lv = dty.type(v) if np.isscalar(v) else v.astype(dty)
-        lres = np.empty_like(lu)
+        lu, lv, lres = _numpy_args(bin_f)
         cuda_u = pnlvm.jit_engine.pycuda.driver.In(lu)
         cuda_v = lv if np.isscalar(lv) else pnlvm.jit_engine.pycuda.driver.In(lv)
         cuda_res = pnlvm.jit_engine.pycuda.driver.Out(lres)
+
         def ex():
             bin_f.cuda_call(cuda_u, cuda_v, np.int32(DIM_X), cuda_res)
             return lres
@@ -72,30 +69,35 @@ def ex():
 @pytest.mark.benchmark(group="Sum")
 def test_vector_sum(benchmark, func_mode):
+
     if func_mode == 'Python':
         def ex():
             return np.sum(u)
+
     elif func_mode == 'LLVM':
-        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum")
+        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
-        lu = u.astype(np.dtype(bin_f.byref_arg_types[0]))
-        llvm_res = np.empty(1, dtype=lu.dtype)
+        np_u = u.astype(bin_f.np_params[0])
+        np_res = bin_f.np_buffer_for_arg(2)
-        ct_u = lu.ctypes.data_as(bin_f.c_func.argtypes[0])
-        ct_res = llvm_res.ctypes.data_as(bin_f.c_func.argtypes[2])
+        ct_u = np_u.ctypes.data_as(bin_f.c_func.argtypes[0])
         def ex():
-            bin_f(ct_u, DIM_X, ct_res)
-            return llvm_res[0]
+            bin_f(ct_u, DIM_X, np_res)
+            return np_res
+
     elif func_mode == 'PTX':
-        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum")
-        lu = u.astype(np.dtype(bin_f.byref_arg_types[0]))
-        cuda_u = pnlvm.jit_engine.pycuda.driver.In(lu)
-        res = np.empty(1, dtype=lu.dtype)
-        cuda_res = pnlvm.jit_engine.pycuda.driver.Out(res)
+        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
+
+        np_u = u.astype(bin_f.np_params[0])
+        np_res = bin_f.np_buffer_for_arg(2)
+
+        cuda_u = pnlvm.jit_engine.pycuda.driver.In(np_u)
+        cuda_res = pnlvm.jit_engine.pycuda.driver.Out(np_res)
+
         def ex():
             bin_f.cuda_call(cuda_u, np.int32(DIM_X), cuda_res)
-            return res[0]
+            return np_res
     res = benchmark(ex)
-    np.testing.assert_allclose(res, sum(u))
+    np.testing.assert_allclose(res, np.sum(u))
diff --git a/tests/llvm/test_compile.py b/tests/llvm/test_compile.py
index 406fc1e2430..c396cba594f 100644
--- a/tests/llvm/test_compile.py
+++ b/tests/llvm/test_compile.py
@@ -4,52 +4,48 @@
 from psyneulink.core import llvm as pnlvm
-ITERATIONS=100
 DIM_X=1000
 DIM_Y=2000
 @pytest.mark.llvm
 def test_recompile():
     # The original builtin mxv function
-    binf = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
-    dty = np.dtype(binf.byref_arg_types[0])
-    assert dty == np.dtype(binf.byref_arg_types[1])
-    assert dty == np.dtype(binf.byref_arg_types[4])
+    bin_f = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
-    matrix = np.random.rand(DIM_X, DIM_Y).astype(dty)
-    vector = np.random.rand(DIM_X).astype(dty)
-    llvm_res = np.empty(DIM_Y, dtype=dty)
+    vector = np.random.rand(DIM_X).astype(bin_f.np_params[0].base)
+    matrix = np.random.rand(DIM_X, DIM_Y).astype(bin_f.np_params[1].base)
+    llvm_res = np.empty(DIM_Y, dtype=bin_f.np_params[4].base)
     x, y = matrix.shape
-    ct_vec = vector.ctypes.data_as(binf.c_func.argtypes[0])
-    ct_mat = matrix.ctypes.data_as(binf.c_func.argtypes[1])
+    ct_vec = vector.ctypes.data_as(bin_f.c_func.argtypes[0])
+    ct_mat = matrix.ctypes.data_as(bin_f.c_func.argtypes[1])
     orig_res = np.empty_like(llvm_res)
-    ct_res = orig_res.ctypes.data_as(binf.c_func.argtypes[4])
+    ct_res = orig_res.ctypes.data_as(bin_f.c_func.argtypes[4])
-    binf.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f.c_func(ct_vec, ct_mat, x, y, ct_res)
     # Rebuild and try again
     # This is not a public API
     pnlvm._llvm_build()
     rebuild_res = np.empty_like(llvm_res)
-    ct_res = rebuild_res.ctypes.data_as(binf.c_func.argtypes[4])
+    ct_res = rebuild_res.ctypes.data_as(bin_f.c_func.argtypes[4])
-    binf.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f.c_func(ct_vec, ct_mat, x, y, ct_res)
     assert np.array_equal(orig_res, rebuild_res)
     # Get a new pointer
-    binf2 = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
+    bin_f2 = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
     new_res = np.empty_like(llvm_res)
-    ct_res = new_res.ctypes.data_as(binf2.c_func.argtypes[4])
+    ct_res = new_res.ctypes.data_as(bin_f2.c_func.argtypes[4])
-    binf2.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f2.c_func(ct_vec, ct_mat, x, y, ct_res)
     assert np.array_equal(rebuild_res, new_res)
     callable_res = np.empty_like(llvm_res)
-    ct_res = callable_res.ctypes.data_as(binf.c_func.argtypes[4])
+    ct_res = callable_res.ctypes.data_as(bin_f.c_func.argtypes[4])
-    binf2(ct_vec, ct_mat, x, y, ct_res)
+    bin_f2(ct_vec, ct_mat, x, y, ct_res)
     assert np.array_equal(new_res, callable_res)
diff --git a/tests/llvm/test_helpers.py b/tests/llvm/test_helpers.py
index f2e2cb141e6..e692bd62f37 100644
--- a/tests/llvm/test_helpers.py
+++ b/tests/llvm/test_helpers.py
@@ -1,6 +1,5 @@
 import ctypes
 import ctypes.util
-import copy
 import numpy as np
 import pytest
 import sys
@@ -16,8 +15,7 @@
 VECTOR = np.random.rand(DIM_X)
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 def test_helper_fclamp(mode):
     with pnlvm.LLVMBuilderContext.get_current() as ctx:
@@ -46,12 +44,13 @@ def test_helper_fclamp(mode):
     ref = np.clip(VECTOR, TST_MIN, TST_MAX)
     bounds = np.asfarray([TST_MIN, TST_MAX])
+
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
-    local_vec = copy.deepcopy(VECTOR)
+    local_vec = VECTOR.copy()
+
     if mode == 'CPU':
-        ct_ty = ctypes.POINTER(bin_f.byref_arg_types[0])
-        ct_vec = local_vec.ctypes.data_as(ct_ty)
-        ct_bounds = bounds.ctypes.data_as(ct_ty)
+        ct_vec = local_vec.ctypes.data_as(bin_f.c_func.argtypes[0])
+        ct_bounds = bounds.ctypes.data_as(bin_f.c_func.argtypes[2])
         bin_f(ct_vec, DIM_X, ct_bounds)
     else:
@@ -61,8 +60,7 @@ def test_helper_fclamp(mode):
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 def test_helper_fclamp_const(mode):
     with pnlvm.LLVMBuilderContext.get_current() as ctx:
@@ -85,12 +83,12 @@ def test_helper_fclamp_const(mode):
         builder.ret_void()
-    local_vec = copy.deepcopy(VECTOR)
+    local_vec = VECTOR.copy()
     ref = np.clip(VECTOR, TST_MIN, TST_MAX)
+
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
     if mode == 'CPU':
-        ct_ty = ctypes.POINTER(bin_f.byref_arg_types[0])
-        ct_vec = local_vec.ctypes.data_as(ct_ty)
+        ct_vec = local_vec.ctypes.data_as(bin_f.c_func.argtypes[0])
         bin_f(ct_vec, DIM_X)
     else:
@@ -100,10 +98,8 @@ def test_helper_fclamp_const(mode):
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
-@pytest.mark.parametrize('rtol,atol',
-                         [[0, 0], [None, None], [None, 100], [2, None]])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
+@pytest.mark.parametrize('rtol,atol', [[0, 0], [None, None], [None, 100], [2, None]])
 @pytest.mark.parametrize('var1,var2', [[1, 1], [1, 100], [1,2], [-4,5], [0, -100], [-1,-2],
                                        [[1,1,1,-4,0,-1], [1,100,2,5,-100,-2]]
@@ -148,18 +144,16 @@ def test_helper_is_close(mode, var1, var2, rtol, atol, fp_type):
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
-    dty = np.dtype(bin_f.byref_arg_types[0])
-    vec1 = np.atleast_1d(np.asfarray(var1, dtype=dty))
-    vec2 = np.atleast_1d(np.asfarray(var2, dtype=dty))
+    vec1 = np.atleast_1d(np.asfarray(var1, dtype=bin_f.np_params[0].base))
+    vec2 = np.atleast_1d(np.asfarray(var2, dtype=bin_f.np_params[1].base))
     assert len(vec1) == len(vec2)
     res = np.empty_like(vec2)
     ref = np.isclose(vec1, vec2, **tolerance)
     if mode == 'CPU':
-        ct_ty = ctypes.POINTER(bin_f.byref_arg_types[0])
-        ct_vec1 = vec1.ctypes.data_as(ct_ty)
-        ct_vec2 = vec2.ctypes.data_as(ct_ty)
-        ct_res = res.ctypes.data_as(ct_ty)
+        ct_vec1 = vec1.ctypes.data_as(bin_f.c_func.argtypes[0])
+        ct_vec2 = vec2.ctypes.data_as(bin_f.c_func.argtypes[1])
+        ct_res = res.ctypes.data_as(bin_f.c_func.argtypes[2])
         bin_f(ct_vec1, ct_vec2, ct_res, len(res))
     else:
@@ -169,10 +163,8 @@ def test_helper_is_close(mode, var1, var2, rtol, atol, fp_type):
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
-@pytest.mark.parametrize('rtol,atol',
-                         [[0, 0], [None, None], [None, 100], [2, None]])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
+@pytest.mark.parametrize('rtol,atol', [[0, 0], [None, None], [None, 100], [2, None]])
 @pytest.mark.parametrize('var1,var2', [[1, 1], [1, 100], [1,2], [-4,5], [0, -100], [-1,-2],
                                        [[1,1,1,-4,0,-1], [1,100,2,5,-100,-2]]
@@ -191,8 +183,7 @@ def test_helper_all_close(mode, var1, var2, atol, rtol):
     with pnlvm.LLVMBuilderContext.get_current() as ctx:
         arr_ptr_ty = ir.ArrayType(ir.DoubleType(), len(vec1)).as_pointer()
-        func_ty = ir.FunctionType(ir.VoidType(), [arr_ptr_ty, arr_ptr_ty,
-                                                  ir.IntType(32).as_pointer()])
+        func_ty = ir.FunctionType(ir.VoidType(), [arr_ptr_ty, arr_ptr_ty, ir.IntType(32).as_pointer()])
         custom_name = ctx.get_unique_name("all_close")
         function = ir.Function(ctx.module, func_ty, name=custom_name)
@@ -207,18 +198,14 @@ def test_helper_all_close(mode, var1, var2, atol, rtol):
     ref = np.allclose(vec1, vec2, **tolerance)
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
-    if mode == 'CPU':
-        ct_ty = ctypes.POINTER(bin_f.byref_arg_types[0])
-        ct_vec1 = vec1.ctypes.data_as(ct_ty)
-        ct_vec2 = vec2.ctypes.data_as(ct_ty)
-        res = ctypes.c_uint32()
+    res = np.array(5, dtype=np.uint32)
+
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
-        bin_f(ct_vec1, ct_vec2, ctypes.byref(res))
+    if mode == 'CPU':
+        bin_f(vec1, vec2, res)
     else:
-        res = np.array([5], dtype=np.uint32)
         bin_f.cuda_wrap_call(vec1, vec2, res)
-        res = res[0]
     assert np.array_equal(res, ref)
@@ -425,9 +412,9 @@ def test_helper_get_array_shape(self, ir_type, expected):
     def test_helper_array_from_shape(self, ir_type, shape):
         assert ir_type == pnlvm.helpers.array_from_shape(shape, self.DOUBLE_TYPE)
+
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 @pytest.mark.parametrize('op,var,expected', [
     (pnlvm.helpers.tanh, 1.0, 0.7615941559557649),
     (pnlvm.helpers.exp, 1.0, 2.718281828459045),
     (pnlvm.helpers.log, 1.0, 0.0),
     (pnlvm.helpers.log1p, 1.0, 0.6931471805599453),
 ])
-@pytest.mark.parametrize('fp_type', [pnlvm.ir.DoubleType(), pnlvm.ir.FloatType()],
-                         ids=lambda x: str(x))
+@pytest.mark.parametrize('fp_type', [pnlvm.ir.DoubleType(), pnlvm.ir.FloatType()], ids=str)
 def test_helper_numerical(mode, op, var, expected, fp_type):
     with pnlvm.LLVMBuilderContext(fp_type) as ctx:
         func_ty = ir.FunctionType(ir.VoidType(), [ctx.float_ty.as_pointer()])
@@ -454,20 +440,19 @@ def test_helper_numerical(mode, op, var, expected, fp_type):
         builder.ret_void()
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0,))
+
+    res = np.asfarray(var, dtype=bin_f.np_params[0])
+
     if mode == 'CPU':
-        res = bin_f.byref_arg_types[0](var)
-        bin_f(ctypes.byref(res))
-        res = res.value
+        bin_f(res)
     else:
-        res = np.ctypeslib.as_array(bin_f.byref_arg_types[0](var))
         bin_f.cuda_wrap_call(res)
     np.testing.assert_allclose(res, expected)
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 @pytest.mark.parametrize('var,expected', [
     (np.asfarray([1,2,3]), np.asfarray([2,3,4])),
     (np.asfarray([[1,2],[3,4]]), np.asfarray([[2,3],[4,5]])),
@@ -488,26 +473,20 @@ def test_helper_elementwise_op(mode, var, expected):
                                              lambda ctx, builder, x: builder.fadd(x.type(1.0), x), out)
         builder.ret_void()
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
-    # convert input to the right type
-    dt = np.dtype(bin_f.byref_arg_types[0])
-    dt = np.empty(1, dtype=dt).flatten().dtype
-    var = var.astype(dt)
+    vec = np.asfarray(var, dtype=bin_f.np_params[0].base)
+    res = bin_f.np_buffer_for_arg(1)
     if mode == 'CPU':
-        ct_vec = np.ctypeslib.as_ctypes(var)
-        res = bin_f.byref_arg_types[1]()
-        bin_f(ct_vec, ctypes.byref(res))
+        bin_f(vec, res)
     else:
-        res = np.empty_like(var)
-        bin_f.cuda_wrap_call(var, res)
+        bin_f.cuda_wrap_call(vec, res)
     assert np.array_equal(res, expected)
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 @pytest.mark.parametrize('var1,var2,expected', [
     (np.array([1.,2.,3.]), np.array([1.,2.,3.]), np.array([2.,4.,6.])),
     (np.array([1.,2.,3.]), np.array([0.,1.,2.]), np.array([1.,3.,5.])),
@@ -537,24 +516,19 @@ def test_helper_recursive_iterate_arrays(mode, var1, var2, expected):
             a = builder.load(a_ptr)
             b = builder.load(b_ptr)
             builder.store(builder.fadd(a,b), o_ptr)
+
         builder.ret_void()
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
-    # convert input to the right type
-    dt = np.dtype(bin_f.byref_arg_types[0])
-    dt = np.empty(1, dtype=dt).flatten().dtype
-    var1 = var1.astype(dt)
-    var2 = var2.astype(dt)
+    vec1 = np.asfarray(var1, dtype=bin_f.np_params[0].base)
+    vec2 = np.asfarray(var2, dtype=bin_f.np_params[0].base)
+    res = bin_f.np_buffer_for_arg(1)
     if mode == 'CPU':
-        ct_vec = np.ctypeslib.as_ctypes(var1)
-        ct_vec_2 = np.ctypeslib.as_ctypes(var2)
-        res = bin_f.byref_arg_types[2]()
-        bin_f(ct_vec, ct_vec_2, ctypes.byref(res))
+        bin_f(vec1, vec2, res)
     else:
-        res = np.empty_like(var1)
-        bin_f.cuda_wrap_call(var1, var2, res)
+        bin_f.cuda_wrap_call(vec1, vec2, res)
     assert np.array_equal(res, expected)
 @pytest.mark.llvm
-@pytest.mark.parametrize('mode', ['CPU',
-                                  pytest.param('PTX', marks=pytest.mark.cuda)])
+@pytest.mark.parametrize('mode', ['CPU', pytest.helpers.cuda_param('PTX')])
 @pytest.mark.parametrize('t1', _fp_types)
 @pytest.mark.parametrize('t2', _fp_types)
 @pytest.mark.parametrize('val', [1.0, '-Inf', 'Inf', 'NaN', 16777216, 16777217, -1.0])
@@ -582,21 +555,18 @@ def test_helper_convert_fp_type(t1, t2, mode, val):
         builder.store(conv_x, y)
         builder.ret_void()
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
-    # Convert type to numpy dtype
-    npt1, npt2 = (np.dtype(bin_f.byref_arg_types[x]) for x in (0, 1))
-    npt1, npt2 = (np.float16().dtype if x == np.uint16 else x for x in (npt1, npt2))
+    # Get the argument numpy dtype
+    np_dt1, np_dt2 = (np.dtype(bin_f.np_params[i]) for i in (0, 1))
     # instantiate value, result and reference
-    x = np.asfarray(val, dtype=npt1)
-    y = np.asfarray(np.random.rand(), dtype=npt2)
-    ref = x.astype(npt2)
+    x = np.asfarray(val, dtype=np_dt1)
+    y = np.asfarray(0, dtype=np_dt2)
+    ref = x.astype(np_dt2)
     if mode == 'CPU':
-        ct_x = x.ctypes.data_as(bin_f.c_func.argtypes[0])
-        ct_y = y.ctypes.data_as(bin_f.c_func.argtypes[1])
-        bin_f(ct_x, ct_y)
+        bin_f(x, y)
     else:
         bin_f.cuda_wrap_call(x, y)
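The test changes above all move in the same direction: compiled builtins are driven through numpy buffers rather than hand-built ctypes objects. As a condensed illustration (not part of the change itself), the sketch below strings together only the helpers that appear in this diff, reusing the __pnl_builtin_vec_sum usage from test_builtins_vector.py; exact dtypes come from the function's np_params and the output buffer from np_buffer_for_arg.

    import numpy as np
    from psyneulink.core import llvm as pnlvm

    DIM_X = 1500
    u = np.random.rand(DIM_X)

    # Ask for argument 2 (the result) to be handled as a numpy buffer.
    bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))

    # Inputs are cast to the dtype advertised by np_params; the output
    # buffer is allocated by the binary function itself.
    np_u = u.astype(bin_f.np_params[0])
    np_res = bin_f.np_buffer_for_arg(2)

    bin_f(np_u.ctypes.data_as(bin_f.c_func.argtypes[0]), DIM_X, np_res)
    np.testing.assert_allclose(np_res, np.sum(u))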