diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 9bec33b4ef..c97e72b391 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -20,7 +20,7 @@ @registry.extensible_enum class DefinedType(aenum.AutoNumberEnum): """ Data types for `DefinedMemlets`. - + :see: DefinedMemlets """ Pointer = () # Pointer @@ -159,6 +159,8 @@ class TargetDispatcher(object): _state_dispatchers: List[Tuple[Callable, target.TargetCodeGenerator]] _generic_state_dispatcher: Optional[target.TargetCodeGenerator] + _generic_reallocate_dispatchers: Dict[dtypes.StorageType, target.TargetCodeGenerator] + _declared_arrays: DefinedMemlets _defined_vars: DefinedMemlets @@ -181,6 +183,7 @@ def __init__(self, framecode): self._node_dispatchers = [] self._generic_node_dispatcher = None self._state_dispatchers = [] + self._generic_reallocate_dispatchers = {} self._generic_state_dispatcher = None self._declared_arrays = DefinedMemlets() @@ -189,7 +192,7 @@ def __init__(self, framecode): @property def declared_arrays(self) -> DefinedMemlets: """ Returns a list of declared variables. - + This is used for variables that must have their declaration and allocation separate. It includes all such variables that have been declared by the dispatcher. @@ -199,7 +202,7 @@ def declared_arrays(self) -> DefinedMemlets: @property def defined_vars(self) -> DefinedMemlets: """ Returns a list of defined variables. - + This includes all variables defined by the dispatcher. """ return self._defined_vars @@ -354,6 +357,15 @@ def register_copy_dispatcher(self, src_storage: dtypes.StorageType, dst_storage: self._copy_dispatchers[dispatcher].append((predicate, func)) + def register_reallocate_dispatcher(self, node_storage: dtypes.StorageType, + func: target.TargetCodeGenerator, + predicate: Optional[Callable] = None) -> None: + + if not isinstance(node_storage, dtypes.StorageType): raise TypeError(node_storage, dtypes.StorageType, isinstance(node_storage, dtypes.StorageType)) + dispatcher = node_storage + self._generic_reallocate_dispatchers[dispatcher] = func + return + def get_state_dispatcher(self, sdfg: SDFG, state: SDFGState) -> target.TargetCodeGenerator: # Check if the state satisfies any predicates that delegate to a # specific code generator @@ -594,6 +606,14 @@ def get_copy_dispatcher(self, src_node: Union[nodes.CodeNode, nodes.AccessNode], return target + def get_reallocate_dispatcher(self, node: Union[nodes.CodeNode, nodes.AccessNode], + edge: MultiConnectorEdge[Memlet], + sdfg: SDFG, state: SDFGState) -> Optional[target.TargetCodeGenerator]: + node_storage = sdfg.arrays[node.data].storage + target = self._generic_reallocate_dispatchers[node_storage] + return target + + def dispatch_copy(self, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, function_stream: CodeIOStream, output_stream: CodeIOStream) -> None: @@ -609,6 +629,17 @@ def dispatch_copy(self, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiC self._used_targets.add(target) target.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, function_stream, output_stream) + def dispatch_reallocate(self, src_node: nodes.Node, node: nodes.Node, edge: MultiConnectorEdge[Memlet], sdfg: SDFG, + cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, function_stream: CodeIOStream, + output_stream: CodeIOStream) -> None: + state = cfg.state(state_id) + target = self.get_reallocate_dispatcher(node, edge, sdfg, state) + assert target is not None + + self._used_targets.add(target) + target.reallocate(sdfg, cfg, dfg, state_id, src_node, node, edge, function_stream, output_stream) + + # Dispatches definition code for a memlet that is outgoing from a tasklet def dispatch_output_definition(self, src_node: nodes.Node, dst_node: nodes.Node, edge, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 89239abcb3..ed7c7bba49 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -9,6 +9,7 @@ import itertools import math import numbers +import re import sys import warnings @@ -231,6 +232,14 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher', elif memlet.data == dst_node.data: copy_shape, src_strides = reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + def replace_dace_defer_dim(string, arrname): + pattern = r"__dace_defer_dim(\d+)" + return re.sub(pattern, r"A_size[\1]", string) + + # TODO: do this better? + dst_expr = replace_dace_defer_dim(dst_expr, dst_node.data) if dst_expr is not None else None + src_expr = replace_dace_defer_dim(src_expr, src_node.data) if src_expr is not None else None + return copy_shape, src_strides, dst_strides, src_expr, dst_expr @@ -539,7 +548,8 @@ def ndcopy_to_strided_copy( return None -def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed_veclen=1, indices=None): +def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, + packed_veclen=1, indices=None, deferred_size_names=None): """ Creates a C++ expression that can be added to a pointer in order to offset it to the beginning of the given subset and offset. @@ -569,9 +579,62 @@ def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed if packed_veclen > 1: index /= packed_veclen - return sym2cpp(index) + if deferred_size_names is not None: + access_str_with_deferred_vars = sym2cpp(index) + def replace_pattern(match): + number = match.group(1) + return deferred_size_names[int(number)] + pattern = r'__dace_defer_dim(\d+)' + access_str = re.sub(pattern, replace_pattern, access_str_with_deferred_vars) + return access_str + else: + return sym2cpp(index) +def _get_deferred_size_names(desc, name): + if (desc.storage not in dtypes.REALLOCATABLE_STORAGES and + not desc.transient): + return None + def check_dace_defer(elements): + for elem in elements: + if "__dace_defer" in str(elem): + return True + return False + deferred_size_names = None + if check_dace_defer(desc.shape): + if desc.storage in dtypes.REALLOCATABLE_STORAGES: + deferred_size_names = [] + for i, elem in enumerate(desc.shape): + if "__dace_defer" in str(elem): + deferred_size_names.append(f"__{name}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]") + else: + deferred_size_names.append(elem) + return deferred_size_names if deferred_size_names is not None and len(deferred_size_names) > 0 else None + +def _get_realloc_dimensions(size_array_name:str, new_size_array_name:str, shape): + # Only consider the offsets with __dace_defer in original dim + mask_array = ["__dace_defer" in str(dim) for dim in shape] + + # In case the size does not only consist of a "__dace_defer" symbol but from an expression involving "__dace_defer" + # The size array is only updated with the symbol, and while calculating the expression, we only replace the __dace_defer_dim pattern + # With the corresponding access from the size array + size_assignment_strs = [] + new_size_strs = [] + old_size_strs = [] + for i, mask in enumerate(mask_array): + if mask: + new_size_str = sym2cpp(shape[i]) + pattern = r'__dace_defer_dim(\d+)' + new_size_strs.append(re.sub(pattern, lambda m: f'{new_size_array_name}[{m.group(1)}]', new_size_str)) + old_size_strs.append(re.sub(pattern, lambda m: f"{size_array_name}[{m.group(1)}]", new_size_str)) + size_assignment_strs.append( + f"{size_array_name}[{i}] = {new_size_array_name}[{i}];" + ) + else: + old_size_strs.append(sym2cpp(shape[i])) + new_size_strs.append(sym2cpp(shape[i])) + return size_assignment_strs, new_size_strs, old_size_strs + def cpp_array_expr(sdfg, memlet, with_brackets=True, @@ -586,8 +649,10 @@ def cpp_array_expr(sdfg, subset = memlet.subset if not use_other_subset else memlet.other_subset s = subset if relative_offset else subsets.Indices(offset) o = offset if relative_offset else None - desc = (sdfg.arrays[memlet.data] if referenced_array is None else referenced_array) - offset_cppstr = cpp_offset_expr(desc, s, o, packed_veclen, indices=indices) + desc : dace.Data = (sdfg.arrays[memlet.data] if referenced_array is None else referenced_array) + desc_name = memlet.data + deferred_size_names = _get_deferred_size_names(desc, desc_name) + offset_cppstr = cpp_offset_expr(desc, s, o, packed_veclen, indices=indices, deferred_size_names=deferred_size_names) # NOTE: Are there any cases where a mix of '.' and '->' is needed when traversing nested structs? # TODO: Study this when changing Structures to be (optionally?) non-pointers. @@ -763,7 +828,7 @@ def is_write_conflicted_with_reason(dfg, edge, datanode=None, sdfg_schedule=None Detects whether a write-conflict-resolving edge can be emitted without using atomics or critical sections, returning the node or SDFG that caused the decision. - + :return: None if the conflict is nonatomic, otherwise returns the scope entry node or SDFG that caused the decision to be made. """ diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index c7d05de5a3..68c85966b7 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1,5 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. from copy import deepcopy +import re from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView import functools @@ -122,6 +123,8 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): for src_storage, dst_storage in itertools.product(cpu_storage, cpu_storage): dispatcher.register_copy_dispatcher(src_storage, dst_storage, None, self) + dispatcher.register_reallocate_dispatcher(dtypes.StorageType.CPU_Heap, self) + @staticmethod def cmake_options(): options = [] @@ -336,6 +339,14 @@ def declare_array(self, declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', cfg, state_id, node) self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef) + + # Size desc is defined only for transient arrays + if nodedesc.transient and nodedesc.storage == dtypes.StorageType.CPU_Heap and type(nodedesc) == data.Array: + size_desc_name = sdfg.arrays[name].size_desc_name + if size_desc_name is not None: + size_desc = sdfg.arrays[size_desc_name] + size_ctypedef = size_desc.dtype.ctype + self._dispatcher.declared_arrays.add(size_desc_name, DefinedType.Pointer, size_ctypedef) return elif nodedesc.storage is dtypes.StorageType.CPU_ThreadLocal: # Define pointer once @@ -357,7 +368,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV allocate_nested_data: bool = True) -> None: alloc_name = cpp.ptr(node.data, nodedesc, sdfg, self._frame) name = alloc_name - tokens = node.data.split('.') top_desc = sdfg.arrays[tokens[0]] # NOTE: Assuming here that all Structure members share transient/storage/lifetime properties. @@ -366,9 +376,11 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV top_storage = top_desc.storage top_lifetime = top_desc.lifetime + if top_transient is False: return + # Check if array is already allocated if self._dispatcher.defined_vars.has(name): return @@ -393,6 +405,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV # Compute array size arrsize = nodedesc.total_size + deferred_allocation = any([s for s in nodedesc.shape if "__dace_defer" in str(s)]) arrsize_bytes = None if not isinstance(nodedesc.dtype, dtypes.opaque): arrsize_bytes = arrsize * nodedesc.dtype.bytes @@ -481,6 +494,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV ((symbolic.issymbolic(arrsize, sdfg.constants)) or (arrsize_bytes and ((arrsize_bytes > Config.get("compiler", "max_stack_array_size")) == True))))): + if nodedesc.storage == dtypes.StorageType.Register: if symbolic.issymbolic(arrsize, sdfg.constants): @@ -497,9 +511,25 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if not declared: declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node) - allocation_stream.write( - "%s = new %s DACE_ALIGN(64)[%s];\n" % (alloc_name, nodedesc.dtype.ctype, cpp.sym2cpp(arrsize)), cfg, - state_id, node) + if deferred_allocation: + allocation_stream.write( + "%s = nullptr; // Deferred Allocation" % + (alloc_name,), + cfg, + state_id, + node + ) + else: + allocation_stream.write( + "%s = new %s DACE_ALIGN(64)[%s];\n" % + (alloc_name, + nodedesc.dtype.ctype, + cpp.sym2cpp(arrsize)), + cfg, + state_id, + node + ) + define_var(name, DefinedType.Pointer, ctypedef) if node.setzero: @@ -650,6 +680,41 @@ def copy_memory( callsite_stream, ) + def reallocate( + self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + src_node: nodes.AccessNode, + dst_node: nodes.AccessNode, + edge: Tuple[nodes.Node, Optional[str], nodes.Node, Optional[str], mmlt.Memlet], + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + ): + function_stream.write( + "#include " + ) + data_name = dst_node.data + new_size_array_name = src_node.data + + desc = sdfg.arrays[data_name] + assert type(desc) == data.Array + size_array_name = desc.size_desc_name + + dtype = sdfg.arrays[data_name].dtype + + size_assignment_strs, new_size_strs, _ = cpp._get_realloc_dimensions( + size_array_name, new_size_array_name, desc.shape + ) + + for size_assignment in size_assignment_strs: + callsite_stream.write(size_assignment) + + size_str = " * ".join(new_size_strs) + callsite_stream.write( + f"{dst_node.data} = static_cast<{dtype} *>(std::realloc(static_cast({dst_node.data}), {size_str} * sizeof({dtype})));" + ) def _emit_copy( self, @@ -690,6 +755,7 @@ def _emit_copy( state_id, [src_node, dst_node], ) + return elif isinstance(src_node, nodes.Tasklet): # Copy out of tasklet @@ -797,6 +863,7 @@ def _emit_copy( state_dfg: SDFGState = cfg.nodes()[state_id] + copy_shape, src_strides, dst_strides, src_expr, dst_expr = cpp.memlet_copy_to_absolute_strides( self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._packed_types) @@ -1098,6 +1165,9 @@ def process_out_memlets(self, write_expr = f"*({ptr_str} + {array_expr}) = {in_local_name};" else: desc_dtype = desc.dtype + # If the storage type if CPU_Heap or GPU_Global then it might be requiring deferred allocation + # We can check if the array requires sepcial access using A_size[0] (CPU) or __A_dim0_size (GPU0) + # by going through the shape and checking for symbols starting with __dace_defer expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame) write_expr = codegen.make_ptr_assignment(in_local_name, conntype, expr, desc_dtype) @@ -1106,7 +1176,11 @@ def process_out_memlets(self, # Dispatch array-to-array outgoing copies here elif isinstance(node, nodes.AccessNode): - if dst_node != node and not isinstance(dst_node, nodes.Tasklet): + if dst_node != node and not isinstance(dst_node, nodes.Tasklet) : + # If it is a size change, reallocate will be called + if edge.dst_conn is not None and edge.dst_conn == "_write_size": + continue + dispatcher.dispatch_copy( node, dst_node, @@ -2159,7 +2233,7 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub sdict = state_dfg.scope_dict() for edge in state_dfg.in_edges(node): - predecessor, _, _, _, memlet = edge + predecessor, _, dst, in_connector, memlet = edge if memlet.data is None: continue # If the edge has to be skipped @@ -2167,10 +2241,9 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub memlet_path = state_dfg.memlet_path(edge) if memlet_path[-1].dst == node: src_node = memlet_path[0].src - # Only generate code in case this is the innermost scope - # (copies are generated at the inner scope, where both arrays exist) - if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]): - self._dispatcher.dispatch_copy( + + if in_connector == "_write_size": + self._dispatcher.dispatch_reallocate( src_node, node, edge, @@ -2181,6 +2254,21 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub function_stream, callsite_stream, ) + else: + # Only generate code in case this is the innermost scope + # (copies are generated at the inner scope, where both arrays exist) + if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]): + self._dispatcher.dispatch_copy( + src_node, + node, + edge, + sdfg, + cfg, + dfg, + state_id, + function_stream, + callsite_stream, + ) # Process outgoing memlets (array-to-array write should be emitted # from the first leading edge out of the array) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index fd9840fdf0..60f5ccb64f 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1,6 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. import ctypes import functools +import re import warnings from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union @@ -133,6 +134,9 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): illegal_copy, predicate=cpu_to_gpu_cpred) dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, st, sched_type, illegal_copy) + + dispatcher.register_reallocate_dispatcher(dtypes.StorageType.GPU_Global, self) + dispatcher.register_reallocate_dispatcher(dtypes.StorageType.CPU_Pinned, self) # End of illegal copies # End of dispatcher registration ###################################### @@ -297,7 +301,9 @@ def _compute_pool_release(self, top_sdfg: SDFG): # Add sink as terminator state for arr in unfreed: - self.pool_release[(sdfg, arr)] = (sink, set()) + if (sdfg.arrays[arr].storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared] + and arr not in sdfg.size_arrays()): # Do put size arrays to pool release + self.pool_release[(sdfg, arr)] = (sink, set()) # Generate final code def get_generated_codeobjects(self): @@ -339,7 +345,7 @@ def get_generated_codeobjects(self): cudaMemPool_t mempool; cudaDeviceGetDefaultMemPool(&mempool, 0); uint64_t threshold = {poolcfg if poolcfg != -1 else 'UINT64_MAX'}; - cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold); + cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold); ''' self._codeobject.code = """ @@ -548,6 +554,21 @@ def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) + def _alloc_gpu_global(self, node, nodedesc, result_alloc, dataname, arrsize_malloc): + if nodedesc.pool: + cudastream = getattr(node, '_cuda_stream', 'nullptr') + if cudastream != 'nullptr': + cudastream = f'__state->gpu_context->streams[{cudastream}]' + result_alloc.write( + f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n' + ) + self._emit_sync(result_alloc) + else: + # Strides are left to the user's discretion + result_alloc.write('DACE_GPU_CHECK(%sMalloc((void**)&%s, %s));\n' % + (self.backend, dataname, arrsize_malloc)) + + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: @@ -561,6 +582,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV # Check if array is already declared declared = False + size_declared = False try: self._dispatcher.declared_arrays.get(dataname) declared = True # Array was already declared in this or upper scopes @@ -585,28 +607,34 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV arrsize = nodedesc.total_size is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) + ctypedef = '%s *' % nodedesc.dtype.ctype + deferred_allocation = any([s for s in nodedesc.shape if "__dace_defer" in str(s)]) + + # Different types of GPU arrays) + if deferred_allocation: + assert nodedesc.storage in dtypes.REALLOCATABLE_STORAGES - # Different types of GPU arrays - if nodedesc.storage == dtypes.StorageType.GPU_Global: if not declared: - result_decl.write('%s %s;\n' % (ctypedef, dataname)) + declaration_stream.write('%s %s;\n' % (ctypedef, dataname)) self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) - if nodedesc.pool: - cudastream = getattr(node, '_cuda_stream', 'nullptr') - if cudastream != 'nullptr': - cudastream = f'__state->gpu_context->streams[{cudastream}]' + if deferred_allocation: result_alloc.write( - f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n' + "%s = nullptr; // Deferred Allocation" % + (dataname,) ) - self._emit_sync(result_alloc) - else: - # Strides are left to the user's discretion - result_alloc.write('DACE_GPU_CHECK(%sMalloc((void**)&%s, %s));\n' % - (self.backend, dataname, arrsize_malloc)) + + elif nodedesc.storage == dtypes.StorageType.GPU_Global: + if not declared: + declaration_stream.write('%s %s;\n' % (ctypedef, dataname)) + self._alloc_gpu_global(node, nodedesc, result_alloc, dataname, arrsize_malloc) + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) + if node.setzero: + if deferred_allocation: + raise Exception("Deferred Allocation and setzero is not supported at the same time.") result_alloc.write('DACE_GPU_CHECK(%sMemset(%s, 0, %s));\n' % (self.backend, dataname, arrsize_malloc)) if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: result_alloc.write(f'{dataname} += {cpp.sym2cpp(nodedesc.start_offset)};\n') @@ -742,7 +770,7 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap if nodedesc.storage == dtypes.StorageType.GPU_Global: if not nodedesc.pool: # If pooled, will be freed somewhere else - callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));\n' % (self.backend, dataname), cfg, state_id, node) + callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));//a1\n' % (self.backend, dataname), cfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: callsite_stream.write('DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), cfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.GPU_Shared or \ @@ -755,7 +783,7 @@ def _compute_cudastreams(self, sdfg: SDFG, default_stream=0, default_event=0): """ Annotates an SDFG (and all nested ones) to include a `_cuda_stream` field. This field is applied to all GPU maps, tasklets, and copies that can be executed in parallel. - + :param sdfg: The sdfg to modify. :param default_stream: The stream ID to start counting from (used in recursion to nested SDFGs). @@ -1263,8 +1291,7 @@ def generate_state(self, ptrname = cpp.ptr(name, desc, sd, self._frame) if isinstance(desc, dt.Array) and desc.start_offset != 0: ptrname = f'({ptrname} - {cpp.sym2cpp(desc.start_offset)})' - - callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', sd) + callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));//a2\n', sd) self._emit_sync(callsite_stream) to_remove.add((sd, name)) for sd, name in to_remove: @@ -1484,7 +1511,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub const_params = _get_const_params(dfg_scope) # make dynamic map inputs constant # TODO move this into _get_const_params(dfg_scope) - const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry)) + # Do not add src as const if the size is being read (src_conn is _read_size) + const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if e.src_conn is None or not (e.src_conn is not None and e.src_conn == "_read_size")) # Store init/exit code streams old_entry_stream = self.scope_entry_stream @@ -1505,6 +1533,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub # Refactor and fix when nested SDFGs are separate functions. self._dispatcher.defined_vars.enter_scope(scope_entry) prototype_kernel_args = {} + host_size_args = {} for aname, arg in kernel_args.items(): # `list` wrapper is used to modify kernel_args within the loop if aname in const_params: defined_type, ctype = None, None @@ -1558,8 +1587,18 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub prototype_kernel_args[aname] = arg + if aname in sdfg.arrays: + arr = sdfg.arrays[aname] + if arr.transient and arr.storage == dtypes.StorageType.GPU_Global and type(arr) == dt.Array and arr.size_desc_name is not None: + size_arr_name = data_desc.size_desc_name + if size_arr_name is not None: + size_arr = sdfg.arrays[size_arr_name] + host_size_args[size_arr_name] = size_arr + kernel_args_typed = [('const ' if k in const_params else '') + v.as_arg(name=k) for k, v in prototype_kernel_args.items()] + host_size_args_typed = ['const ' + v.as_arg(name=k) + for k, v in host_size_args.items()] kernel_stream = CodeIOStream() self.generate_kernel_scope(sdfg, cfg, dfg_scope, state_id, scope_entry.map, kernel_name, grid_dims, block_dims, @@ -1585,9 +1624,26 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})' # Write kernel prototype + dyn_args = [] + dyn_args_typed = [] + for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): + dyn_args.append(e.dst_conn) + dyn_args_typed.append(f"const {e.dst.in_connectors[e.dst_conn]} {e.dst_conn}") + # Size arrays + needed_size_scalars_declaration = [] + for size_desc_name, arg in host_size_args.items(): + arr_name = size_desc_name.removesuffix("_size") + for i in range(size_arr.shape[0]): + if f"__{arr_name}_dim{i}_size" not in dyn_args: + dyn_args.append(f"__{arr_name}_dim{i}_size") + size_desc = sdfg.arrays[size_desc_name] + dyn_args_typed.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size") + needed_size_scalars_declaration.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];") + #raise Exception(needed_size_scalars_declaration, dyn_args) + self._localcode.write( '__global__ void %s %s(%s) {\n' % - (launch_bounds, kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed)), cfg, state_id, node) + (launch_bounds, kernel_name, ', '.join(kernel_args_typed + dyn_args_typed + extra_kernel_args_typed)), cfg, state_id, node) # Write constant expressions in GPU code self._frame.generate_constants(sdfg, self._localcode) @@ -1612,7 +1668,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub DACE_EXPORTED void __dace_runkernel_{fname}({fargs}); void __dace_runkernel_{fname}({fargs}) {{ -""".format(fname=kernel_name, fargs=', '.join(state_param + kernel_args_typed + extra_call_args_typed)), cfg, state_id, +""".format(fname=kernel_name, fargs=', '.join(state_param + kernel_args_typed + extra_call_args_typed + host_size_args_typed)), cfg, state_id, node) if is_persistent: @@ -1659,9 +1715,13 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub # make sure dynamic map inputs are properly handled for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): + memlet_definition = self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]) self._localcode.write( - self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), + memlet_definition, cfg, state_id, scope_entry) + self._localcode.write("// Array sizes of arrays used are passed here if needed to the kernel even") + for decl in needed_size_scalars_declaration: + self._localcode.write(decl, cfg, state_id, scope_entry) gdims = 'dace_number_blocks, 1, 1' if is_persistent else ', '.join(_topy(grid_dims)) bdims = ', '.join(_topy(block_dims)) @@ -1696,7 +1756,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub void *{kname}_args[] = {{ {kargs} }}; gpuError_t __err = {backend}LaunchKernel((void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream});''' .format(kname=kernel_name, - kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + extra_kernel_args), + kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + + ['(void *)&' + arg for arg in dyn_args] + + extra_kernel_args), gdims=gdims, bdims=bdims, dynsmem=_topy(dynsmem_size), @@ -1714,7 +1776,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub # Add invocation to calling code (in another file) function_stream.write( 'DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % - (kernel_name, ', '.join(state_param + kernel_args_typed + extra_call_args_typed)), cfg, state_id, + (kernel_name, ', '.join(state_param + kernel_args_typed + extra_call_args_typed + host_size_args_typed)), cfg, state_id, scope_entry) # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. @@ -1737,7 +1799,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub '__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(['__state'] + [cpp.ptr(aname, arg, sdfg, self._frame) - for aname, arg in kernel_args.items()] + extra_call_args)), cfg, state_id, + for aname, arg in kernel_args.items()] + extra_call_args + + list(host_size_args.keys()))), cfg, state_id, scope_entry) # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. @@ -2010,6 +2073,9 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S # handle dynamic map inputs for e in dace.sdfg.dynamic_map_inputs(cfg.node(state_id), dfg_scope.source_nodes()[0]): + # If src is a _read_size, it was handled before + if e.src_conn is not None and e.src_conn == "_read_size": + continue kernel_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), cfg, state_id, @@ -2718,6 +2784,86 @@ def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgra def make_ptr_vector_cast(self, *args, **kwargs): return cpp.make_ptr_vector_cast(*args, **kwargs) + def reallocate( + self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + src_node: nodes.AccessNode, + dst_node: nodes.AccessNode, + edge: Tuple[nodes.Node, Optional[str], nodes.Node, Optional[str], dace.memlet.Memlet], + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + ): + function_stream.write( + "#include " + ) + data_name = dst_node.data + new_size_array_name = src_node.data + + desc = sdfg.arrays[data_name] + assert type(desc) == dt.Array + size_array_name = desc.size_desc_name + + dtype = sdfg.arrays[data_name].dtype + + size_assignment_strs, new_size_strs, old_size_strs = cpp._get_realloc_dimensions( + size_array_name, new_size_array_name, desc.shape + ) + + + # Call realloc only after no __dace_defer is left in size_array (must be true) + # Save new and old sizes before registering them, because we need both to compute the bound of the new array + old_size_str = " * ".join(old_size_strs) + old_size_str += f" * sizeof({dtype.ctype})" + new_size_str = " * ".join(new_size_strs) + new_size_str += f" * sizeof({dtype.ctype})" + tmp_storage_name = "__tmp_realloc_move_storage" + + callsite_stream.write(f"if ({dst_node.data} == nullptr) {{", cfg, state_id, dst_node.guid) + if desc.storage == dtypes.StorageType.GPU_Global: + self._alloc_gpu_global(dst_node, desc, callsite_stream, data_name, new_size_str) + else: + assert desc.storage == dtypes.StorageType.CPU_Pinned + callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast(&{data_name}), {new_size_str}));", cfg, state_id, dst_node.guid) + callsite_stream.write("} else {\n", cfg, state_id, dst_node.guid) + callsite_stream.write(f"{dtype}* {tmp_storage_name};") + if desc.storage == dtypes.StorageType.GPU_Global: + self._alloc_gpu_global(None, desc, callsite_stream, tmp_storage_name, new_size_str) + else: + assert desc.storage == dtypes.StorageType.CPU_Pinned + callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast(&{tmp_storage_name}), {new_size_str}));", cfg, state_id, dst_node.guid) + + s = "" + copy_size_str = f"Min({old_size_str}, {new_size_str})" + if desc.storage == dtypes.StorageType.GPU_Global: + if not desc.pool: # If pooled, will be freed somewhere else + s += f"DACE_GPU_CHECK({self.backend}Memcpy(static_cast({tmp_storage_name}), static_cast({data_name}), {copy_size_str}, cudaMemcpyDeviceToDevice));\n" + s += f"DACE_GPU_CHECK({self.backend}Free({data_name}));\n" + s += f"{data_name} = {tmp_storage_name};\n" + else: + cudastream = getattr(dst_node, '_cuda_stream', 'nullptr') + if cudastream != 'nullptr': + cudastream = f'__state->gpu_context->streams[{cudastream}]' + s += f'DACE_GPU_CHECK({self.backend}MallocAsync(reinterpret_cast(&{tmp_storage_name}), {new_size_str}, {cudastream}));\n' + s += f"DACE_GPU_CHECK({self.backend}MemcpyAsync(static_cast({tmp_storage_name}), static_cast({data_name}), {copy_size_str}, {cudastream}), cudaMemcpyDeviceToDevice));\n" + s += f"DACE_GPU_CHECK({self.backend}FreeAsync({data_name}, {cudastream}));\n" + callsite_stream.write(s) + self._emit_sync(callsite_stream) + callsite_stream.write(f"{data_name} = {tmp_storage_name};\n") + s = "" + elif desc.storage == dtypes.StorageType.CPU_Pinned: + s += f"DACE_GPU_CHECK({self.backend}Memcpy(static_cast({tmp_storage_name}), static_cast({data_name}), {copy_size_str}, cudaMemcpyHostToHost));\n" + s += f"DACE_GPU_CHECK({self.backend}FreeHost({data_name}));\n" + s += f"{data_name} = {tmp_storage_name};\n" + else: + raise Exception("Realloc in CUDA, storage type must be CPU_Pinned or GPU_Global") + s += "}\n" + callsite_stream.write(s) + + for size_assignment in size_assignment_strs: + callsite_stream.write(size_assignment) ######################################################################## ######################################################################## diff --git a/dace/codegen/targets/fpga.py b/dace/codegen/targets/fpga.py index 197f515e52..3e78681387 100644 --- a/dace/codegen/targets/fpga.py +++ b/dace/codegen/targets/fpga.py @@ -2371,6 +2371,7 @@ def generate_host_function_boilerplate(self, sdfg, cfg, state, nested_global_tra # Any extra transients stored in global memory on the FPGA must now be # allocated and passed to the kernel + for arr_node in nested_global_transients: self._dispatcher.dispatch_allocate(sdfg, cfg, state, None, arr_node, arr_node.desc(sdfg), None, host_code_stream) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 238b0f7a22..4d56883c58 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -23,7 +23,7 @@ from dace.sdfg.analysis import cfg as cfg_analysis from dace.sdfg.state import ConditionalBlock, ControlFlowBlock, ControlFlowRegion, LoopRegion from dace.transformation.passes.analysis import StateReachability, loop_analysis - +from dace.codegen.targets import cpp def _get_or_eval_sdfg_first_arg(func, sdfg): if callable(func): @@ -838,6 +838,7 @@ def allocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Un self._dispatcher.dispatch_allocate(tsdfg, cfg if state is None else state.parent_graph, state, state_id, node, desc, function_stream, callsite_stream, declare, allocate) + def deallocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Union[nodes.EntryNode, SDFGState, SDFG], function_stream: CodeIOStream, callsite_stream: CodeIOStream): @@ -912,6 +913,7 @@ def generate_code(self, # Allocate inter-state variables global_symbols = copy.deepcopy(sdfg.symbols) global_symbols.update({aname: arr.dtype for aname, arr in sdfg.arrays.items()}) + interstate_symbols = {} for cfr in sdfg.all_control_flow_regions(): for e in cfr.dfs_edges(cfr.start_block): @@ -962,6 +964,28 @@ def generate_code(self, self.dispatcher.defined_vars.add(isvarName, disp.DefinedType.Scalar, isvarType.ctype) callsite_stream.write('\n', sdfg) + # After the symbols + # Allocate size arrays (always check as name and array changes affect size descriptor names) + # Only allocate arrays that really require deferred allocation (symbol has __dace_defer) + # Reshaping these arrays are not allowed + size_arrays = sdfg.size_arrays() + for size_desc_name in size_arrays: + size_nodedesc = sdfg.arrays[size_desc_name] + assert ("__return" not in size_desc_name) + ctypedef = size_nodedesc.dtype.ctype + array = [v for v in sdfg.arrays.values() if type(v) == data.Array and v.size_desc_name is not None and v.size_desc_name == size_desc_name] + assert len(array) <= 1 + if len(array) == 1: + array = array[0] + if type(array) == data.Array and array.is_deferred_array: + # 0 is a placeholder value, it is not important what the value is + dimensions = ["0" if "__dace_defer" in cpp.sym2cpp(dim) else cpp.sym2cpp(dim) for dim in array.shape] + size_str = ",".join(dimensions) + assert len(size_nodedesc.shape) == 1 + alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}] = {{{size_str}}};\n' + callsite_stream.write(alloc_str) + self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef, allow_shadowing=True) + ####################################################################### # Generate actual program body diff --git a/dace/data.py b/dace/data.py index 9749411fe6..4082b97fd4 100644 --- a/dace/data.py +++ b/dace/data.py @@ -27,7 +27,7 @@ def create_datadescriptor(obj, no_custom_desc=False): """ Creates a data descriptor from various types of objects. - + :see: dace.data.Data """ from dace import dtypes # Avoiding import loops @@ -184,6 +184,7 @@ def _transient_setter(self, value): location = DictProperty(key_type=str, value_type=str, desc='Full storage location identifier (e.g., rank, GPU ID)') debuginfo = DebugInfoProperty(allow_none=True) + def __init__(self, dtype, shape, transient, storage, location, lifetime, debuginfo): self.dtype = dtype self.shape = shape @@ -548,7 +549,7 @@ class TensorIndex(ABC): def iteration_type(self) -> TensorIterationTypes: """ Iteration capability supported by this index. - + See TensorIterationTypes for reference. """ pass @@ -566,7 +567,7 @@ def locate(self) -> bool: def assembly(self) -> TensorAssemblyType: """ What assembly type is supported by the index. - + See TensorAssemblyType for reference. """ pass @@ -576,7 +577,7 @@ def assembly(self) -> TensorAssemblyType: def full(self) -> bool: """ True if the level is full, False otw. - + A level is considered full if it encompasses all valid coordinates along the corresponding tensor dimension. """ @@ -587,7 +588,7 @@ def full(self) -> bool: def ordered(self) -> bool: """ True if the level is ordered, False otw. - + A level is ordered when all coordinates that share the same ancestor are ordered by increasing value (e.g. in typical CSR). """ @@ -598,7 +599,7 @@ def ordered(self) -> bool: def unique(self) -> bool: """ True if coordinate in the level are unique, False otw. - + A level is considered unique if no collection of coordinates that share the same ancestor contains duplicates. In CSR this is True, in COO it is not. @@ -610,7 +611,7 @@ def unique(self) -> bool: def branchless(self) -> bool: """ True if the level doesn't branch, false otw. - + A level is considered branchless if no coordinate has a sibling (another coordinate with same ancestor) and all coordinates in parent level have a child. In other words if there is a bijection between the coordinates @@ -624,7 +625,7 @@ def branchless(self) -> bool: def compact(self) -> bool: """ True if the level is compact, false otw. - + A level is compact if no two coordinates are separated by an unlabled node that does not encode a coordinate. An example of a compact level can be found in CSR, while the DIA formats range and offset levels are @@ -637,7 +638,7 @@ def compact(self) -> bool: def fields(self, lvl: int, dummy_symbol: symbolic.SymExpr) -> Dict[str, Data]: """ Generates the fields needed for the index. - + :return: a Dict of fields that need to be present in the struct """ pass @@ -675,7 +676,7 @@ def from_json(cls, json_obj, context=None): class TensorIndexDense(TensorIndex): """ Dense tensor index. - + Levels of this type encode the the coordinate in the interval [0, N), where N is the size of the corresponding dimension. This level doesn't need any index structure beyond the corresponding dimension size. @@ -742,9 +743,9 @@ def __repr__(self) -> str: class TensorIndexCompressed(TensorIndex): """ Tensor level that stores coordinates in segmented array. - + Levels of this type are compressed using a segented array. The pos array - holds the start and end positions of the segment in the crd (coordinate) + holds the start and end positions of the segment in the crd (coordinate) array that holds the child coordinates corresponding the parent. """ @@ -816,7 +817,7 @@ def __repr__(self) -> str: class TensorIndexSingleton(TensorIndex): """ Tensor index that encodes a single coordinate per parent coordinate. - + Levels of this type hold exactly one coordinate for every coordinate in the parent level. An example can be seen in the COO format, where every coordinate but the first is encoded in this manner. @@ -889,7 +890,7 @@ def __repr__(self) -> str: class TensorIndexRange(TensorIndex): """ Tensor index that encodes a interval of coordinates for every parent. - + The interval is computed from an offset for each parent together with the tensor dimension size of this level (M) and the parent level (N) parents corresponding tensor. Given the parent coordinate i, the level encodes the @@ -959,7 +960,7 @@ def __repr__(self) -> str: class TensorIndexOffset(TensorIndex): """ Tensor index that encodes the next coordinates as offset from parent. - + Given a parent coordinate i and an offset index k, the level encodes the coordinate j = i + offset[k]. """ @@ -1027,7 +1028,7 @@ def __repr__(self) -> str: class Tensor(Structure): """ Abstraction for Tensor storage format. - + This abstraction is based on [https://doi.org/10.1145/3276493]. """ @@ -1054,7 +1055,7 @@ def __init__(self, Below are examples of common matrix storage formats: .. code-block:: python - + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr = dace.data.Tensor( @@ -1130,7 +1131,7 @@ def __init__(self, :param value_type: data type of the explicitly stored values. :param tensor_shape: logical shape of tensor (#rows, #cols, etc...) - :param indices: + :param indices: a list of tuples, each tuple represents a level in the tensor storage hirachy, specifying the levels tensor index type, and the corresponding dimension this level encodes (as index of the @@ -1305,12 +1306,12 @@ class Array(Data): how it should behave. The array definition is flexible in terms of data allocation, it allows arbitrary multidimensional, potentially - symbolic shapes (e.g., an array with size ``N+1 x M`` will have ``shape=(N+1, M)``), of arbitrary data + symbolic shapes (e.g., an array with size ``N+1 x M`` will have ``shape=(N+1, M)``), of arbitrary data typeclasses (``dtype``). The physical data layout of the array is controlled by several properties: * The ``strides`` property determines the ordering and layout of the dimensions --- it specifies how many elements in memory are skipped whenever one element in that dimension is advanced. For example, the contiguous - dimension always has a stride of ``1``; a C-style MxN array will have strides ``(N, 1)``, whereas a + dimension always has a stride of ``1``; a C-style MxN array will have strides ``(N, 1)``, whereas a FORTRAN-style array of the same size will have ``(1, M)``. Strides can be larger than the shape, which allows post-padding of the contents of each dimension. * The ``start_offset`` property is a number of elements to pad the beginning of the memory buffer with. This is @@ -1327,7 +1328,7 @@ class Array(Data): to zero. To summarize with an example, a two-dimensional array with pre- and post-padding looks as follows: - + .. code-block:: text [xxx][ |xx] @@ -1345,7 +1346,7 @@ class Array(Data): Notice that the last padded row does not appear in strides, but is a consequence of ``total_size`` being larger. - + Apart from memory layout, other properties of ``Array`` help the data-centric transformation infrastructure make decisions about the array. ``allow_conflicts`` states that warnings should not be printed if potential conflicted @@ -1385,6 +1386,10 @@ class Array(Data): 'it is inferred by other properties and the OptionalArrayInference pass.') pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred') + size_desc_name = Property(dtype=str, default=None, allow_none=True, desc="Name of the size desc, not None only for reallocatable storage types that are also transient") + is_size_array = Property(dtype=bool, default=False, desc='Special array that is used to track the size of an another array') + is_deferred_array = Property(dtype=bool, default=False, desc='Array that requires deferred allocation') + def __init__(self, dtype, shape, @@ -1401,13 +1406,18 @@ def __init__(self, total_size=None, start_offset=None, optional=None, - pool=False): + pool=False, + size_desc_name=None): super(Array, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) self.allow_conflicts = allow_conflicts self.may_alias = may_alias self.alignment = alignment + self.size_desc_name = size_desc_name + self.is_size_array = False + if size_desc_name is not None: + assert self.is_size_array is False if start_offset is not None: self.start_offset = start_offset @@ -1431,6 +1441,9 @@ def __init__(self, self.offset = cp.copy(offset) else: self.offset = [0] * len(shape) + + self.is_deferred_array = any(["__dace_defer" in str(dim) for dim in self.shape]) + self.validate() def __repr__(self): @@ -1570,7 +1583,7 @@ def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]: def free_symbols(self): return self.used_symbols(all_symbols=True) - def _set_shape_dependent_properties(self, shape, strides, total_size, offset): + def _set_shape_dependent_properties(self, shape, strides, total_size, offset, sdfg=None): """ Used to set properties which depend on the shape of the array either to their default value, which depends on the shape, or @@ -1595,18 +1608,28 @@ def _set_shape_dependent_properties(self, shape, strides, total_size, offset): else: self.offset = [0] * len(shape) + if self.is_deferred_array and sdfg is not None: + sdfg.arrays[self.size_desc_name].set_shape(new_shape=(len(shape),)) + def set_shape( self, new_shape, strides=None, total_size=None, offset=None, + sdfg=None, ): """ Updates the shape of an array. """ self.shape = new_shape - self._set_shape_dependent_properties(new_shape, strides, total_size, offset) + self._set_shape_dependent_properties( + shape=new_shape, + strides=strides, + total_size=total_size, + offset=offset, + sdfg=sdfg + ) self.validate() @@ -1827,7 +1850,7 @@ def from_json(cls, json_obj, context=None): class View: - """ + """ Data descriptor that acts as a static reference (or view) of another data container. Can be used to reshape or reinterpret existing data without copying it. @@ -1839,9 +1862,9 @@ class View: node, and the other side (out/in) has a different number of edges. * If there is one incoming and one outgoing edge, and one leads to a code node, the one that leads to an access node is the viewed data. - * If both sides lead to access nodes, if one memlet's data points to the + * If both sides lead to access nodes, if one memlet's data points to the view it cannot point to the viewed node. - * If both memlets' data are the respective access nodes, the access + * If both memlets' data are the respective access nodes, the access node at the highest scope is the one that is viewed. * If both access nodes reside in the same scope, the input data is viewed. @@ -1910,11 +1933,11 @@ def view(viewed_container: Data, debuginfo=None): class Reference: - """ + """ Data descriptor that acts as a dynamic reference of another data descriptor. It can be used just like a regular data descriptor, except that it could be set to an arbitrary container (or subset thereof) at runtime. To set a reference, connect another access node to it and use the "set" connector. - + In order to enable data-centric analysis and optimizations, avoid using References as much as possible. """ @@ -1965,7 +1988,7 @@ def view(viewed_container: Data, debuginfo=None): @make_properties class ArrayView(Array, View): - """ + """ Data descriptor that acts as a static reference (or view) of another array. Can be used to reshape or reinterpret existing data without copying it. @@ -1989,7 +2012,7 @@ def as_array(self): @make_properties class StructureView(Structure, View): - """ + """ Data descriptor that acts as a view of another structure. """ @@ -2020,7 +2043,7 @@ def as_structure(self): @make_properties class ContainerView(ContainerArray, View): - """ + """ Data descriptor that acts as a view of another container array. Can be used to access nested container types without a copy. """ @@ -2062,9 +2085,9 @@ def as_array(self): @make_properties class ArrayReference(Array, Reference): - """ + """ Data descriptor that acts as a dynamic reference of another array. See ``Reference`` for more information. - + In order to enable data-centric analysis and optimizations, avoid using References as much as possible. """ @@ -2084,9 +2107,9 @@ def as_array(self): @make_properties class StructureReference(Structure, Reference): - """ + """ Data descriptor that acts as a dynamic reference of another Structure. See ``Reference`` for more information. - + In order to enable data-centric analysis and optimizations, avoid using References as much as possible. """ @@ -2109,10 +2132,10 @@ def as_structure(self): @make_properties class ContainerArrayReference(ContainerArray, Reference): - """ + """ Data descriptor that acts as a dynamic reference of another data container array. See ``Reference`` for more information. - + In order to enable data-centric analysis and optimizations, avoid using References as much as possible. """ diff --git a/dace/dtypes.py b/dace/dtypes.py index d7076dc987..2fc663a45d 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -103,6 +103,12 @@ class ScheduleType(aenum.AutoNumberEnum): StorageType.FPGA_ShiftRegister, ] +REALLOCATABLE_STORAGES = [ + StorageType.CPU_Heap, + StorageType.CPU_Pinned, + StorageType.GPU_Global, +] + @undefined_safe_enum class ReductionType(aenum.AutoNumberEnum): diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 57db9d7089..7a52db3dcf 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -189,13 +189,13 @@ def parse_dace_program(name: str, Parses a ``@dace.program`` function into an SDFG. :param src_ast: The AST of the Python program to parse. - :param visitor: A ProgramVisitor object returned from + :param visitor: A ProgramVisitor object returned from ``preprocess_dace_program``. :param closure: An object that contains the @dace.program closure. :param simplify: If True, simplification pass will be performed. :param save: If True, saves source mapping data for this SDFG. - :param progress: If True, prints a progress bar of the parsing process. - If None (default), prints after 5 seconds of parsing. + :param progress: If True, prints a progress bar of the parsing process. + If None (default), prints after 5 seconds of parsing. If False, never prints progress. :return: A 2-tuple of SDFG and its reduced (used) closure. """ @@ -1466,8 +1466,8 @@ def _parse_subprogram(self, name, node, is_tasklet=False, extra_symbols=None, ex def _symbols_from_params(self, params: List[Tuple[str, Union[str, dtypes.typeclass]]], memlet_inputs: Dict[str, Memlet]) -> Dict[str, symbolic.symbol]: """ - Returns a mapping between symbol names to their type, as a symbol - object to maintain compatibility with global symbols. Used to maintain + Returns a mapping between symbol names to their type, as a symbol + object to maintain compatibility with global symbols. Used to maintain typed symbols in SDFG scopes (e.g., map, consume). """ from dace.codegen.tools.type_inference import infer_expr_type @@ -1900,7 +1900,7 @@ def _parse_map_inputs(self, name: str, params: List[Tuple[str, str]], def _parse_consume_inputs(self, node: ast.FunctionDef) -> Tuple[str, str, Tuple[str, str], str, str]: """ Parse consume parameters from AST. - + :return: A 5-tuple of Stream name, internal stream name, (PE index, number of PEs), condition, chunk size. """ @@ -2179,7 +2179,7 @@ def _add_dependencies(self, state.add_nedge(internal_node, exit_node, dace.Memlet()) def _add_nested_symbols(self, nsdfg_node: nodes.NestedSDFG): - """ + """ Adds symbols from nested SDFG mapping values (if appear as globals) to current SDFG. """ @@ -3983,7 +3983,8 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no # Change transient names arrays_before = list(sdfg.arrays.items()) for arrname, array in arrays_before: - if array.transient and arrname[:5] == '__tmp': + print(arrname, array, sdfg.arrays, sdfg.size_arrays()) + if array.transient and arrname[:5] == '__tmp' and arrname not in sdfg.size_arrays(): if int(arrname[5:]) < self.sdfg._temp_transients: if self.sdfg._temp_transients > sdfg._temp_transients: new_name = self.sdfg.temp_data_name() diff --git a/dace/memlet.py b/dace/memlet.py index 85bd0a348d..8d396d8e4c 100644 --- a/dace/memlet.py +++ b/dace/memlet.py @@ -68,9 +68,9 @@ def __init__(self, debuginfo: Optional[dtypes.DebugInfo] = None, wcr_nonatomic: bool = False, allow_oob: bool = False): - """ + """ Constructs a Memlet. - + :param expr: A string expression of the this memlet, given as an ease of use API. Must follow one of the following forms: 1. ``ARRAY``, @@ -82,7 +82,7 @@ def __init__(self, :param subset: The subset to take from the data attached to the edge, represented either as a string or a Subset object. :param other_subset: The subset to offset into the other side of the - memlet, represented either as a string or a Subset + memlet, represented either as a string or a Subset object. :param volume: The exact number of elements moved using this memlet, or the maximum number of elements if @@ -91,14 +91,14 @@ def __init__(self, is runtime-defined and unbounded. :param dynamic: If True, the number of elements moved in this memlet is defined dynamically at runtime. - :param wcr: A lambda function (represented as a string or Python AST) + :param wcr: A lambda function (represented as a string or Python AST) specifying how write-conflicts are resolved. The syntax - of the lambda function receives two elements: ``current`` - value and `new` value, and returns the value after + of the lambda function receives two elements: ``current`` + value and `new` value, and returns the value after resolution. For example, summation is represented by ``'lambda cur, new: cur + new'``. :param debuginfo: Line information from the generating source code. - :param wcr_nonatomic: If True, overrides the automatic code generator + :param wcr_nonatomic: If True, overrides the automatic code generator decision and treat all write-conflict resolution operations as non-atomic, which might cause race conditions in the general case. @@ -225,16 +225,16 @@ def __deepcopy__(self, memo): return node def is_empty(self) -> bool: - """ + """ Returns True if this memlet carries no data. Memlets without data are - primarily used for connecting nodes to scopes without transferring - data to them. + primarily used for connecting nodes to scopes without transferring + data to them. """ return (self.data is None and self.subset is None and self.other_subset is None) @property def num_accesses(self): - """ + """ Returns the total memory movement volume (in elements) of this memlet. """ return self.volume @@ -255,7 +255,7 @@ def simple(data, """ DEPRECATED: Constructs a Memlet from string-based expressions. - :param data: The data object or name to access. + :param data: The data object or name to access. :param subset_str: The subset of `data` that is going to be accessed in string format. Example: '0:N'. :param wcr_str: A lambda function (as a string) specifying @@ -335,7 +335,7 @@ def _parse_from_subexpr(self, expr: str): # [subset] syntax if expr.startswith('['): return None, SubsetProperty.from_string(expr[1:-1]) - + # array[subset] syntax arrname, subset_str = expr[:-1].split('[') if not dtypes.validate_name(arrname): @@ -385,8 +385,8 @@ def _parse_memlet_from_str(self, expr: str): def try_initialize(self, sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', edge: 'dace.sdfg.graph.MultiConnectorEdge'): - """ - Tries to initialize the internal fields of the memlet (e.g., src/dst + """ + Tries to initialize the internal fields of the memlet (e.g., src/dst subset) once it is added to an SDFG as an edge. """ from dace.sdfg.nodes import AccessNode, CodeNode # Avoid import loops @@ -435,7 +435,7 @@ def get_dst_subset(self, edge: 'dace.sdfg.graph.MultiConnectorEdge', state: 'dac @staticmethod def from_array(dataname, datadesc, wcr=None): - """ + """ Constructs a Memlet that transfers an entire array's contents. :param dataname: The name of the data descriptor in the SDFG. @@ -456,7 +456,7 @@ def __eq__(self, other): def replace(self, repl_dict): """ Substitute a given set of symbols with a different set of symbols. - + :param repl_dict: A dict of string symbol names to symbols with which to replace them. """ @@ -538,8 +538,8 @@ def validate(self, sdfg, state): def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]: """ - Returns a set of symbols used in this edge's properties. - + Returns a set of symbols used in this edge's properties. + :param all_symbols: If False, only returns the set of symbols that will be used in the generated code and are needed as arguments. :param edge: If given, provides richer context-based tests for the case @@ -606,7 +606,7 @@ def get_free_symbols_by_indices(self, indices_src: List[int], indices_dst: List[ def get_stride(self, sdfg: 'dace.sdfg.SDFG', map: 'dace.sdfg.nodes.Map', dim: int = -1) -> 'dace.symbolic.SymExpr': """ Returns the stride of the underlying memory when traversing a Map. - + :param sdfg: The SDFG in which the memlet resides. :param map: The map in which the memlet resides. :param dim: The dimension that is incremented. By default it is the innermost. diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index 940114bbe2..3a1507f535 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -51,9 +51,9 @@ def infer_out_connector_type(sdfg: SDFG, state: SDFGState, node: nodes.CodeNode, def infer_connector_types(sdfg: SDFG): - """ + """ Infers connector types throughout an SDFG and its nested SDFGs in-place. - + :param sdfg: The SDFG to infer. """ # Loop over states, and in a topological sort over each state's nodes @@ -124,13 +124,13 @@ def set_default_schedule_and_storage_types(scope: Union[SDFG, SDFGState, nodes.E use_parent_schedule: bool = False, state: SDFGState = None, child_nodes: Dict[nodes.Node, List[nodes.Node]] = None): - """ + """ Sets default storage and schedule types throughout SDFG in-place. Replaces ``ScheduleType.Default`` and ``StorageType.Default`` - with the corresponding types according to the parent scope's schedule. - + with the corresponding types according to the parent scope's schedule. + The defaults for storage types are determined by the - ``dtypes.SCOPEDEFAULT_STORAGE`` dictionary (for example, a GPU device + ``dtypes.SCOPEDEFAULT_STORAGE`` dictionary (for example, a GPU device schedule, by default, will allocate containers on the shared memory). Following storage type inference for a scope, nested scopes (e.g., map entry, nested SDFG) are evaluated using the ``dtypes.STORAGEDEFAULT_SCHEDULE`` dictionary (for example, a diff --git a/dace/sdfg/replace.py b/dace/sdfg/replace.py index 2f9ead4dcd..ef4f06a548 100644 --- a/dace/sdfg/replace.py +++ b/dace/sdfg/replace.py @@ -54,7 +54,7 @@ def _replsym(symlist, symrepl): def replace_dict(subgraph: 'StateSubgraphView', repl: Dict[str, str], symrepl: Optional[Dict[symbolic.SymbolicType, symbolic.SymbolicType]] = None): - """ + """ Finds and replaces all occurrences of a set of symbols/arrays in the given subgraph. :param subgraph: The given graph or subgraph to replace in. @@ -86,7 +86,7 @@ def replace_dict(subgraph: 'StateSubgraphView', def replace(subgraph: 'StateSubgraphView', name: str, new_name: str): """ Finds and replaces all occurrences of a symbol or array in the given subgraph. - + :param subgraph: The given graph or subgraph to replace in. :param name: Name to find. :param new_name: Name to replace. diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 4a141aef12..e1d4962a02 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -101,8 +101,8 @@ def _nested_arrays_from_json(obj, context=None): return NestedDict({k: dace.serialize.from_json(v, context) for k, v in obj.items()}) -def _replace_dict_keys(d, old, new): - if old in d: +def _replace_dict_keys(d, old, new, filter_set=None): + if old in d and (filter_set is None or old in filter_set): if new in d: warnings.warn('"%s" already exists in SDFG' % new) d[new] = d[old] @@ -683,6 +683,14 @@ def arrays(self): """ return self._arrays + def size_arrays(self): + size_arrays = [k for k, v in self.arrays.items() if hasattr(v, "is_size_array") and v.is_size_array is True] + return size_arrays + + @property + def arrays(self): + return self._arrays + @property def process_grids(self): """ Returns a dictionary of process-grid descriptors (`ProcessGrid` objects) used in this SDFG. """ @@ -740,17 +748,56 @@ def replace_dict(self, } # Replace in arrays and symbols (if a variable name) + size_arrays = self.sdfg.size_arrays() + non_size_arrays = {k for k in self.arrays if k not in size_arrays} + size_desc_map = dict() + if replace_keys: # Filter out nested data names, as we cannot and do not want to replace names in nested data descriptors repldict_filtered = {k: v for k, v in repldict.items() if '.' not in k} for name, new_name in repldict_filtered.items(): if validate_name(new_name): - _replace_dict_keys(self._arrays, name, new_name) + _replace_dict_keys(self.arrays, name, new_name, non_size_arrays) + # Size desc names are updated later + if "__return" not in new_name: # To catch __return_0, __return_1, gpu__return, fpga__return + size_desc_map[new_name] = new_name + "_size" + else: + size_desc_map[new_name] = None _replace_dict_keys(self.symbols, name, new_name) _replace_dict_keys(self.constants_prop, name, new_name) _replace_dict_keys(self.callback_mapping, name, new_name) _replace_dict_values(self.callback_mapping, name, new_name) + # Update size descriptors + # Having return_size break things (it is collected to the tuple of return) delete it from the arrays + # If this is called because array's properties had been changed then set the size desc to none + size_ararys_to_rm = set() + for arr_name, size_desc_name in size_desc_map.items(): + arr = self.arrays[arr_name] if arr_name in self.arrays else None + if arr is not None and type(arr) == dt.Array: + size_desc_name_before = arr.size_desc_name + # If we change the name of an array, then we need to change its size array accordingly + if (arr.transient and size_desc_name_before is not None): + arr.size_desc_name = size_desc_name + assert (arr.size_desc_name == size_desc_name) + self.arrays[size_desc_name] = self.arrays.pop(size_desc_name_before) + # If the new size array is None, then we can remove the previous (and now unused size array) + if arr.size_desc_name is None and size_desc_name_before is not None: + size_ararys_to_rm.add(size_desc_name_before) + # If the new size array is not None, but it was non before we need to add the size array + if size_desc_name_before is None and arr.size_desc_name is not None: + retval = self._get_size_arr(arr_name, arr) + if retval is not None: + size_desc_name, size_desc = retval + assert (size_desc_name == arr.size_desc_name) + self._arrays[size_desc_name] = size_desc + self._add_symbols(size_desc) + + # Rm any size array we need to remove + for size_arr_name in size_ararys_to_rm: + if size_arr_name in self.arrays: + del self.arrays[size_arr_name] + # Replace inside data descriptors for array in self.arrays.values(): replace_properties_dict(array, repldict, symrepl) @@ -1085,7 +1132,7 @@ def as_schedule_tree(self, in_place: bool = False) -> 'ScheduleTreeScope': the execution order of the SDFG. Each node in the tree can either represent a single statement (symbol assignment, tasklet, copy, library node, etc.) or a ``ScheduleTreeScope`` block (map, for-loop, pipeline, etc.) that contains other nodes. - + It can be used to generate code from an SDFG, or to perform schedule transformations on the SDFG. For example, erasing an empty if branch, or merging two consecutive for-loops. @@ -1151,8 +1198,16 @@ def remove_data(self, name, validate=True): f"{name}: it is accessed by node " f"{node} in state {state}.") + # Check for size desc + if type(self._arrays[name]) == dt.Array: + size_desc_name = self._arrays[name].size_desc_name + # If unused it might have been removed by optimization + if size_desc_name is not None and size_desc_name in self._arrays: + del self._arrays[size_desc_name] + del self._arrays[name] + def reset_sdfg_list(self): """ Reset the CFG list when changes have been made to the SDFG's CFG tree. @@ -1734,22 +1789,24 @@ def add_array(self, alignment=0, may_alias=False) -> Tuple[str, dt.Array]: """ Adds an array to the SDFG data descriptor store. """ - - # convert strings to int if possible + # convert strings to int if possible, unless it is not the reserved symbol for deferred allocation newshape = [] - for s in shape: - try: - newshape.append(int(s)) - except: - newshape.append(dace.symbolic.pystr_to_symbolic(s)) + for i, s in enumerate(shape): + if isinstance(s, str) and "__dace_defer" in s: + newshape.append(dace.symbolic.pystr_to_symbolic(f"{s}_dim{i}")) + else: + try: + newshape.append(int(s)) + except: + newshape.append(dace.symbolic.pystr_to_symbolic(s)) shape = newshape strides = strides or None if isinstance(dtype, type) and dtype in dtypes._CONSTANT_TYPES[:-1]: dtype = dtypes.typeclass(dtype) - desc = dt.Array(dtype, - shape, + desc = dt.Array(dtype=dtype, + shape=shape, storage=storage, location=location, allow_conflicts=allow_conflicts, @@ -1760,9 +1817,11 @@ def add_array(self, alignment=alignment, debuginfo=debuginfo, total_size=total_size, - may_alias=may_alias) + may_alias=may_alias, + size_desc_name=None) - return self.add_datadesc(name, desc, find_new_name=find_new_name), desc + array_name = self.add_datadesc(name, desc, find_new_name=find_new_name) + return array_name, desc def add_view(self, name: str, @@ -2009,6 +2068,46 @@ def add_temp_transient_like(self, desc: Union[dt.Array, dt.Scalar], dtype=None, newdesc.debuginfo = debuginfo return self.add_datadesc(self.temp_data_name(), newdesc), newdesc + def _add_symbols(self, desc: dt.Data): + if isinstance(desc, dt.Structure): + for v in desc.members.values(): + if isinstance(v, dt.Data): + self._add_symbols(v) + for sym in desc.free_symbols: + if sym.name not in self.symbols: + self.add_symbol(sym.name, sym.dtype) + + def _get_size_arr(self, name: str, datadesc: dt.Data): + if ( + datadesc.transient is True and + type(datadesc) == dt.Array and + "__return" not in name and + datadesc.lifetime is not dtypes.AllocationLifetime.External and + datadesc.lifetime is not dtypes.AllocationLifetime.Persistent and + datadesc.is_deferred_array + ): + size_desc_name = f"{name}_size" + # Regardless of the scope and storage it is allocated as a register array + # And at the start of the SDFG (or nested SDFG), not setting SDFG prevents to_gpu assertions + # from failing. To lifetime and storage are set explicitly to + # to prevent optimizations to putting them to FPGA/GPU storage + size_desc = dt.Array(dtype=dace.uint64, + shape=(len(datadesc.shape),), + storage=dtypes.StorageType.CPU_Heap, + location=None, + allow_conflicts=False, + transient=True, + strides=(1,), + offset=(0,), + lifetime=dtypes.AllocationLifetime.State, + alignment=datadesc.alignment, + debuginfo=datadesc.debuginfo, + may_alias=False, + size_desc_name=None) + size_desc.is_size_array = True + return size_desc_name, size_desc + return None + def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str: """ Adds an existing data descriptor to the SDFG array store. @@ -2049,18 +2148,18 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str if name in self._pgrids: raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a ProcessGrid.') - def _add_symbols(sdfg: SDFG, desc: dt.Data): - if isinstance(desc, dt.Structure): - for v in desc.members.values(): - if isinstance(v, dt.Data): - _add_symbols(sdfg, v) - for sym in desc.free_symbols: - if sym.name not in sdfg.symbols: - sdfg.add_symbol(sym.name, sym.dtype) - # Add the data descriptor to the SDFG and all symbols that are not yet known. self._arrays[name] = datadesc - _add_symbols(self, datadesc) + self._add_symbols(datadesc) + + retval = self._get_size_arr(name, datadesc) + if retval is not None: + size_desc_name, size_desc = retval + self._arrays[size_desc_name] = size_desc + # In case find_new_name and a new name is returned + # we need to update the size descriptor name of the array + datadesc.size_desc_name = size_desc_name + self._add_symbols(size_desc) return name @@ -2400,8 +2499,8 @@ def argument_typecheck(self, args, kwargs, types_only=False): # Omit return values from arguments expected_args = collections.OrderedDict([(k, v) for k, v in expected_args.items() - if not k.startswith('__return')]) - kwargs = {k: v for k, v in kwargs.items() if not k.startswith('__return')} + if '__return' not in k]) + kwargs = {k: v for k, v in kwargs.items() if '__return' not in k} num_args_passed = len(args) + len(kwargs) num_args_expected = len(expected_args) @@ -2519,7 +2618,7 @@ def auto_optimize(self, """ Runs a basic sequence of transformations to optimize a given SDFG to decent performance. In particular, performs the following: - + * Simplify * Auto-parallelization (loop-to-map) * Greedy application of SubgraphFusion diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 30640306cd..e8e99d30c1 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -247,7 +247,7 @@ def read_and_write_sets(self) -> Tuple[Set[AnyStr], Set[AnyStr]]: """ Determines what data is read and written in this graph. Does not include reads to subsets of containers that have previously been written within the same state. - + :return: A two-tuple of sets of things denoting ({data read}, {data written}). """ return set(), set() @@ -822,7 +822,7 @@ def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, def read_and_write_sets(self) -> Tuple[Set[AnyStr], Set[AnyStr]]: """ Determines what data is read and written in this subgraph. - + :return: A two-tuple of sets of things denoting ({data read}, {data written}). """ @@ -2729,7 +2729,7 @@ def inline(self, lower_returns: bool = False) -> Tuple[bool, Any]: for b_edge in parent.in_edges(self): parent.add_edge(b_edge.src, self.start_block, b_edge.data) parent.remove_edge(b_edge) - + end_state = None if len(to_connect) > 0: end_state = parent.add_state(self.label + '_end') @@ -3474,6 +3474,9 @@ def add_branch(self, condition: Optional[CodeBlock], branch: ControlFlowRegion): def remove_branch(self, branch: ControlFlowRegion): self._branches = [(c, b) for c, b in self._branches if b is not branch] + def edges(self) -> List[Edge['dace.sdfg.InterstateEdge']]: + return [] + def get_meta_codeblocks(self): codes = [] for c, _ in self.branches: @@ -3489,7 +3492,7 @@ def get_meta_read_memlets(self) -> List[mm.Memlet]: if c is not None: read_memlets.extend(memlets_in_ast(c.code[0], self.sdfg.arrays)) return read_memlets - + def _used_symbols_internal(self, all_symbols: bool, defined_syms: Optional[Set] = None, @@ -3537,7 +3540,7 @@ def to_json(self, parent=None): json['branches'] = [(condition.to_json() if condition is not None else None, cfg.to_json()) for condition, cfg in self._branches] return json - + @classmethod def from_json(cls, json_obj, context=None): context = context or {'sdfg': None, 'parent_graph': None} diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 5ad00a2942..3b11ba88f3 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -277,6 +277,12 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context 'rather than using multiple references to the same one', sdfg, None) references.add(id(desc)) + if (name.endswith("_size") and desc.transient and type(desc) == dt.Array and + hasattr(desc, "is_size_array") and desc.is_size_array is False): + raise InvalidSDFGError( + f'Only size arrays allowed to end with _size, desc: {desc}, storage: {desc.storage}, transient: {desc.transient}', sdfg, None + ) + # Because of how the code generator works Scalars can not be return values. # TODO: Remove this limitation as the CompiledSDFG contains logic for that. if isinstance(desc, dt.Scalar) and name.startswith("__return") and not desc.transient: @@ -318,6 +324,47 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context "Arrays that use a multibank access pattern must have the size of the first dimension equal" f" the number of banks and have at least 2 dimensions for array {name}", sdfg, None) + # Check the size array shapes match + if type(desc) == dt.Array: + if desc.is_size_array is False and desc.size_desc_name is not None: + # It is an array which is not a size array and needs to have a size array + size_desc = sdfg._arrays[desc.size_desc_name] + size_arr_len = size_desc.shape[0] + if not isinstance(size_arr_len, int) and (isinstance(size_arr_len, dace.symbolic.symbol) and not size_arr_len.is_integer): + raise InvalidSDFGError( + f"Size arrays need to be one-dimensional and have an integer length known at compile time. {desc.size_desc_name}: {size_desc.shape}" + , sdfg, None + ) + # TODO: This check can be implemented as part of a getter/setter on the dimensions of the array? + if int(size_arr_len) != len(desc.shape): + raise InvalidSDFGError( + f"Size arrays size needs to match to shape of its array: {desc.size_desc_name}, {size_desc.shape}: {name}, {desc.shape}" + , sdfg, None + ) + + if isinstance(desc, dt.Array): #is_deferred_array and is_size_array are only defined for dt.Array + if desc.is_deferred_array: + if desc.is_size_array: + raise InvalidSDFGError( + f"A deferred array can't be used as a size array for another array. Data descriptor name: {desc}." + , sdfg, None + ) + if not desc.transient: + raise InvalidSDFGError( + f"Deferred arrays need to be transient." + , sdfg, None + ) + if "__return" in name: + raise InvalidSDFGError( + f"Deferred arrays can't be returned. {desc} has __return in its name." + , sdfg, None + ) + if desc.storage not in dtypes.REALLOCATABLE_STORAGES: + raise InvalidSDFGError( + f"Deferred arrays are supported only for {dtypes.REALLOCATABLE_STORAGES} storage types for {desc}." + , sdfg, None + ) + # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) @@ -561,6 +608,63 @@ def validate_state(state: 'dace.sdfg.SDFGState', 'written to, but only given to nested SDFG as an ' 'input connector' % node.data, sdfg, state_id, nid) + # Deferred allocation related tests + insize = "_write_size" + outsize = "_read_size" + read_size_edges = list(state.edges_by_connector(node, outsize)) + write_size_edges = list(state.edges_by_connector(node, insize)) + + # Reading-Writing the size is valid only if the array is transient and has the storage type CPU_Heap or GPU_Global + has_writes = len(write_size_edges) > 0 + has_writes_or_reads = len(read_size_edges) + len(write_size_edges) > 0 + size_access_allowed = arr.transient and (arr.storage in dtypes.REALLOCATABLE_STORAGES) + if has_writes_or_reads and not size_access_allowed: + raise InvalidSDFGNodeError('Reading the size of an array, or changing (writing to) the size of an array ' + f'is only valid if the array is transient and the storage is in {dtypes.REALLOCATABLE_STORAGES}', sdfg, state_id, nid) + if has_writes and scope[node] is not None: + raise InvalidSDFGNodeError('Resizing array is not allowed within a scope (e.g. not inside maps)', sdfg, state_id, nid) + + + if len(write_size_edges) > 1: + raise InvalidSDFGNodeError('One node can have at maximum one edge writing to its size descriptior', sdfg, state_id, nid) + + # The write needs to always have the same length of the dimension of the node + if len(write_size_edges) == 1: + write_size_edge = write_size_edges[0] + edge_id = state.edge_id(write_size_edge) + required_range = len(arr.shape) + try: + elements = int(write_size_edge.data.num_elements()) + if elements != required_range or write_size_edge.data.subset.dims() != 1: + raise Exception + except Exception: + raise InvalidSDFGEdgeError('The write to a node needs to match the length of the array shape ' + 'the volume needs to be integer (not symbolic) and the shape one dimensional', sdfg, state_id, edge_id) + + # Reads to map can be only scalars-sized + for read_size_edge in read_size_edges: + edge_id = state.edge_id(read_size_edge) + from dace import nodes + if (isinstance(read_size_edge.dst, nodes.EntryNode) or + isinstance(read_size_edge.dst, nodes.AccessNode) or + isinstance(read_size_edge.dst, nodes.Tasklet)): + if isinstance(read_size_edge.dst, nodes.MapEntry): + required_range = 1 + try: + elements = int(read_size_edge.data.num_elements()) + if elements != required_range and read_size_edge.data.subset.dims() != 1: + raise Exception() + except Exception: + raise InvalidSDFGEdgeError('The read to a map entry needs have dimension 1' + 'If reading multiple dimensions, multiple edges need to go to the map entry', sdfg, state_id, edge_id) + else: + raise InvalidSDFGEdgeError('The read size should connect to an entry node, access node, or tasklet (this can be changed)' + , sdfg, state_id, edge_id) + + + + + if (isinstance(node, nd.ConsumeEntry) and "IN_stream" not in node.in_connectors): raise InvalidSDFGNodeError("Consume entry node must have an input stream", sdfg, state_id, nid) if (isinstance(node, nd.ConsumeEntry) and "OUT_stream" not in node.out_connectors): @@ -712,9 +816,38 @@ def validate_state(state: 'dace.sdfg.SDFGState', name = None if isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Structure): name = None + # Special case: if the name is the size array of the src_node, then it is ok, checked with the "size_desc_name" + src_size_access = isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Array) and name is not None and name == sdfg.arrays[src_node.data].size_desc_name + dst_size_access = isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Array) and name is not None and name == sdfg.arrays[dst_node.data].size_desc_name + sdict = state.scope_dict() + if src_size_access and dst_size_access: + raise InvalidSDFGEdgeError( + "Reading from the size connector and writing to the size connector at the same time of same data is not valid", + sdfg, + state_id, + eid, + ) + if dst_size_access and sdict[dst_node] is not None: + raise InvalidSDFGEdgeError( + "Reallocating data (writing to the size connector) within a scope is not valid", + sdfg, + state_id, + eid, + ) + if dst_size_access: + dst_arr = sdfg.arrays[dst_node.data] + if (dst_arr.storage != dtypes.StorageType.GPU_Global and + dst_arr.storage != dtypes.StorageType.CPU_Heap): + raise InvalidSDFGEdgeError( + f"Reallocating data is allowed only to GPU_Global or CPU_Heap, the storage type is {dst_arr.storage}", + sdfg, + state_id, + eid, + ) if (name is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) - and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn)) - and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn))): + and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn and not src_size_access)) + and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn and not dst_size_access)) + ): raise InvalidSDFGEdgeError( "Memlet data does not match source or destination " "data nodes)", @@ -743,7 +876,12 @@ def validate_state(state: 'dace.sdfg.SDFGState', if isinstance(dst_node, nd.AccessNode) and e.data.data != dst_node.data else src_node) if isinstance(subset_node, nd.AccessNode): - arr = sdfg.arrays[e.data.data] + if src_size_access: + arr = sdfg.arrays[sdfg.arrays[src_node.data].size_desc_name] + elif dst_size_access: + arr = sdfg.arrays[sdfg.arrays[dst_node.data].size_desc_name] + else: + arr = sdfg.arrays[e.data.data] # Dimensionality if e.data.subset.dims() != len(arr.shape): raise InvalidSDFGEdgeError( @@ -833,10 +971,11 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Check dimensionality of memory access if isinstance(e.data.subset, (sbs.Range, sbs.Indices)): - if e.data.subset.dims() != len(sdfg.arrays[e.data.data].shape): + desc = sdfg.arrays[e.data.data] + if e.data.subset.dims() != len(desc.shape): raise InvalidSDFGEdgeError( "Memlet subset uses the wrong dimensions" - " (%dD for a %dD data node)" % (e.data.subset.dims(), len(sdfg.arrays[e.data.data].shape)), + " (%dD for a %dD data node)" % (e.data.subset.dims(), len(desc.shape)), sdfg, state_id, eid, @@ -845,8 +984,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Verify that source and destination subsets contain the same # number of elements if not e.data.allow_oob and e.data.other_subset is not None and not ( - (isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Stream)) or - (isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Stream))): + (isinstance(src_node, nd.AccessNode) and src_node.data in sdfg.arrays and isinstance(sdfg.arrays[src_node.data], dt.Stream)) or + (isinstance(dst_node, nd.AccessNode) and src_node.data in sdfg.arrays and isinstance(sdfg.arrays[dst_node.data], dt.Stream))): src_expr = (e.data.src_subset.num_elements() * sdfg.arrays[src_node.data].veclen) dst_expr = (e.data.dst_subset.num_elements() * sdfg.arrays[dst_node.data].veclen) if symbolic.inequal_symbols(src_expr, dst_expr): diff --git a/dace/subsets.py b/dace/subsets.py index 0fdc36c22e..e6d69e1a67 100644 --- a/dace/subsets.py +++ b/dace/subsets.py @@ -99,7 +99,7 @@ def covers(self, other): return False return True - + def covers_precise(self, other): """ Returns True if self contains all the elements in other. """ @@ -734,7 +734,7 @@ def compose(self, other): def squeeze(self, ignore_indices: Optional[List[int]] = None, offset: bool = True) -> List[int]: """ Removes size-1 ranges from the subset and returns a list of dimensions that remain. - + For example, ``[i:i+10, j]`` will change the range to ``[i:i+10]`` and return ``[0]``. If ``offset`` is True, the subset will become ``[0:10]``. @@ -770,7 +770,7 @@ def squeeze(self, ignore_indices: Optional[List[int]] = None, offset: bool = Tru def unsqueeze(self, axes: Sequence[int]) -> List[int]: """ Adds 0:1 ranges to the subset, in the indices contained in axes. - + The method is mostly used to restore subsets that had their length-1 ranges removed (i.e., squeezed subsets). Hence, the method is called 'unsqueeze'. @@ -1046,7 +1046,7 @@ def squeeze(self, ignore_indices=None): def unsqueeze(self, axes: Sequence[int]) -> List[int]: """ Adds zeroes to the subset, in the indices contained in axes. - + The method is mostly used to restore subsets that had their zero-indices removed (i.e., squeezed subsets). Hence, the method is called 'unsqueeze'. @@ -1112,7 +1112,7 @@ def __init__(self, subset): self.subset_list = [subset] def covers(self, other): - """ + """ Returns True if this SubsetUnion covers another subset (using a bounding box). If other is another SubsetUnion then self and other will only return true if self is other. If other is a different type of subset @@ -1128,13 +1128,13 @@ def covers(self, other): return False else: return any(s.covers(other) for s in self.subset_list) - + def covers_precise(self, other): - """ + """ Returns True if this SubsetUnion covers another subset. If other is another SubsetUnion then self and other will only return true if self is other. If other is a different type of subset - true is returned when one of the subsets in self is equal to other + true is returned when one of the subsets in self is equal to other """ if isinstance(other, SubsetUnion): @@ -1154,7 +1154,7 @@ def __str__(self): string += " " string += subset.__str__() return string - + def dims(self): if not self.subset_list: return 0 @@ -1178,7 +1178,7 @@ def free_symbols(self) -> Set[str]: for subset in self.subset_list: result |= subset.free_symbols return result - + def replace(self, repl_dict): for subset in self.subset_list: subset.replace(repl_dict) @@ -1192,15 +1192,15 @@ def num_elements(self): min = subset.num_elements() except: continue - + return min def _union_special_cases(arb: symbolic.SymbolicType, brb: symbolic.SymbolicType, are: symbolic.SymbolicType, bre: symbolic.SymbolicType): - """ - Special cases of subset unions. If case found, returns pair of + """ + Special cases of subset unions. If case found, returns pair of (min,max), otherwise returns None. """ if are + 1 == brb: @@ -1267,7 +1267,7 @@ def union(subset_a: Subset, subset_b: Subset) -> Subset: """ Compute the union of two Subset objects. If the subsets are not of the same type, degenerates to bounding-box union. - + :param subset_a: The first subset. :param subset_b: The second subset. :return: A Subset object whose size is at least the union of the two @@ -1303,7 +1303,7 @@ def union(subset_a: Subset, subset_b: Subset) -> Subset: def list_union(subset_a: Subset, subset_b: Subset) -> Subset: - """ + """ Returns the union of two Subset lists. :param subset_a: The first subset. diff --git a/dace/transformation/dataflow/redundant_array.py b/dace/transformation/dataflow/redundant_array.py index 5e5072ff32..ebccf93047 100644 --- a/dace/transformation/dataflow/redundant_array.py +++ b/dace/transformation/dataflow/redundant_array.py @@ -1675,7 +1675,7 @@ def _offset_subset(self, mapping: Dict[int, int], subset: subsets.Range, edge_su class RemoveIntermediateWrite(pm.SingleStateTransformation): """ Moves intermediate writes insde a Map's subgraph outside the Map. - + Currently, the transformation supports only the case `WriteAccess -> MapExit`, where the edge has an empty Memlet. """ diff --git a/dace/transformation/passes/array_elimination.py b/dace/transformation/passes/array_elimination.py index 803c81b21e..b327a9f608 100644 --- a/dace/transformation/passes/array_elimination.py +++ b/dace/transformation/passes/array_elimination.py @@ -37,7 +37,7 @@ def depends_on(self): def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[Set[str]]: """ Removes redundant arrays and access nodes. - + :param sdfg: The SDFG to modify. :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass results as ``{Pass subclass name: returned object from pass}``. If not run in a @@ -84,7 +84,12 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[S result.update({n.data for n in removed_nodes}) # If node is completely removed from graph, erase data descriptor - for aname, desc in list(sdfg.arrays.items()): + array_items = list(sdfg.arrays.items()) + size_descriptors = set([v.size_desc_name for v in sdfg.arrays.values() if type(v) == data.Array and v.size_desc_name is not None]) + for aname, desc in array_items: + # Remove size descriptors only if the original array is removed + if aname in size_descriptors: + continue if not desc.transient or isinstance(desc, data.Scalar): continue if aname not in access_sets or not access_sets[aname]: @@ -92,7 +97,10 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[S if isinstance(desc, data.Structure) and len(desc.members) > 0: continue sdfg.remove_data(aname, validate=False) + if desc.size_desc_name is not None: + sdfg.remove_data(desc.size_desc_name, validate=False) result.add(aname) + result.add(desc.size_desc_name) return result or None diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py new file mode 100644 index 0000000000..c1e6f7a2da --- /dev/null +++ b/tests/deferred_alloc_test.py @@ -0,0 +1,489 @@ +import dace +from dace.transformation.dataflow.redundant_array import RedundantArray, RedundantSecondArray +from dace.transformation.interstate.state_fusion import StateFusion +import numpy +import pytest + + +@pytest.fixture(params=[dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global, dace.dtypes.StorageType.CPU_Pinned]) +def storage_type(request): + return request.param + +@pytest.fixture(params=[True, False]) +def transient(request): + return request.param + +@pytest.fixture +def schedule_type(storage_type): + if storage_type == dace.dtypes.StorageType.CPU_Heap: + return dace.dtypes.ScheduleType.Sequential + elif storage_type == dace.dtypes.StorageType.GPU_Global: + return dace.dtypes.ScheduleType.GPU_Device + elif storage_type == dace.dtypes.StorageType.CPU_Pinned: + return dace.dtypes.ScheduleType.Sequential + +def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, write_size="0:2"): + sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_1") + + sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, transient=transient) + + state = sdfg.add_state("main") + + an_1 = state.add_access('A') + an_1.add_in_connector('_write_size') + + sdfg.add_array(name="user_size", shape=(2,), dtype=dace.uint64) + an_2 = state.add_access("user_size") + + state.add_edge(an_2, None, an_1, '_write_size', + dace.Memlet(expr=f"user_size[{write_size}]") ) + + return sdfg + +def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType.Default): + sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_2") + + sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient) + + state = sdfg.add_state("main") + + an_1 = state.add_access('A') + an_1.add_in_connector('_write_size') + an_1.add_out_connector('_read_size') + + sdfg.add_array(name="user_size", shape=(2,), dtype=dace.uint64) + an_2 = state.add_access("user_size") + + state.add_edge(an_2, None, an_1, '_write_size', + dace.Memlet(expr="user_size[0:2]") ) + + map_entry, map_exit = state.add_map(name="map",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) }, + schedule=schedule_type) + state.add_edge(an_1, '_read_size', map_entry, "__A_dim1_size", dace.Memlet(expr="A_size[1]")) + map_entry.add_in_connector("__A_dim1_size") + map_exit.add_in_connector("IN_A") + map_exit.add_out_connector("OUT_A") + + t1 = state.add_tasklet(name="assign", inputs={}, outputs={"_out"}, code="_out=3.0") + state.add_edge(map_entry, None, t1, None, dace.Memlet(None)) + state.add_edge(t1, "_out", map_exit, "IN_A", dace.Memlet(expr="A[i, j]")) + + an_3 = state.add_access('A') + state.add_edge(map_exit, "OUT_A", an_3, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,15-1, 1), (0,"__A_dim1_size-1", 1)]))) + + arr_name, arr = sdfg.add_array(name="example_array", dtype=dace.float32, shape=(1,), transient=False, storage=storage_type) + arrn = state.add_access(arr_name) + + if storage_type == dace.dtypes.StorageType.CPU_Heap: + assert (schedule_type == dace.dtypes.ScheduleType.Sequential or schedule_type == dace.dtypes.ScheduleType.CPU_Multicore) + elif storage_type == dace.dtypes.StorageType.GPU_Global: + assert (schedule_type == dace.dtypes.ScheduleType.GPU_Device) + elif storage_type == dace.dtypes.StorageType.CPU_Pinned: + assert (schedule_type == dace.dtypes.ScheduleType.Sequential) + + an_3.add_out_connector('_read_size') + map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)])}, + schedule=schedule_type) + state.add_edge(an_3, '_read_size', map_entry2, "__A_dim1_size", dace.Memlet(expr="A_size[1]")) + state.add_edge(an_3, None, map_entry2, "IN_A", dace.Memlet(expr="A[0:15, 0:__A_dim1_size]")) + map_entry2.add_in_connector("__A_dim1_size") + map_entry2.add_in_connector("IN_A") + map_entry2.add_out_connector("OUT_A") + map_exit2.add_in_connector("IN_A") + map_exit2.add_out_connector("OUT_A") + + t2 = state.add_tasklet(name="check", inputs={"_in"}, outputs={"_out"}, code='_out = _in', language=dace.dtypes.Language.Python) + state.add_edge(map_entry2, "OUT_A", t2, "_in", dace.Memlet(expr="A[i, j]")) + state.add_edge(t2, "_out", map_exit2, "IN_A", dace.Memlet(expr="A[i, j]")) + + an_5 = state.add_access('A') + state.add_edge(map_exit2, "OUT_A", an_5, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,15-1, 1), (0,"__A_dim1_size-1", 1)]))) + + state.add_edge(an_5, None, arrn, None, dace.memlet.Memlet("A[7, 7]")) + + return sdfg + +def _valid_to_reallocate(transient, storage_type): + return transient and (storage_type in dace.dtypes.REALLOCATABLE_STORAGES) + +def _test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool): + sdfg = _get_trivial_alloc_sdfg(storage_type, transient) + try: + sdfg.validate() + except Exception: + if not _valid_to_reallocate(transient, storage_type): + return + else: + raise AssertionError("Realloc with transient data failed when it was expected not to.") + + if not _valid_to_reallocate(transient, storage_type): + raise AssertionError("Realloc with non-transient data did not fail when it was expected to.") + + sdfg.compile() + + sdfg.simplify() + sdfg.apply_transformations_repeated([StateFusion, RedundantArray, RedundantSecondArray]) + sdfg.validate() + sdfg.compile() + + +def _test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType): + sdfg = _get_assign_map_sdfg(storage_type, transient, schedule_type) + try: + sdfg.validate() + except Exception: + if not _valid_to_reallocate(transient, storage_type): + return + else: + raise AssertionError("Realloc-use with transient data failed when it was expected not to.") + + if not _valid_to_reallocate(transient, storage_type): + raise AssertionError("Realloc-use with non-transient data did not fail when it was expected to.") + + compiled_sdfg = sdfg.compile() + if storage_type == dace.dtypes.StorageType.CPU_Heap or storage_type == dace.dtypes.StorageType.CPU_Pinned: + arr = numpy.array([-1.0]).astype(numpy.float32) + user_size = numpy.array([10, 10]).astype(numpy.uint64) + compiled_sdfg(user_size=user_size, example_array=arr) + assert ( arr[0] == 3.0 ) + elif storage_type == dace.dtypes.StorageType.GPU_Global: + try: + import cupy + except Exception: + return + + arr = cupy.array([-1.0]).astype(cupy.float32) + user_size = numpy.array([10, 10]).astype(numpy.uint64) + compiled_sdfg(user_size=user_size, example_array=arr) + assert ( arr.get()[0] == 3.0 ) + + sdfg.simplify() + sdfg.apply_transformations_repeated([StateFusion, RedundantArray, RedundantSecondArray]) + sdfg.validate() + compiled_sdfg = sdfg.compile() + if storage_type == dace.dtypes.StorageType.CPU_Heap or storage_type == dace.dtypes.StorageType.CPU_Pinned: + arr = numpy.array([-1.0]).astype(numpy.float32) + user_size = numpy.array([10, 10]).astype(numpy.uint64) + compiled_sdfg(user_size=user_size, example_array=arr) + assert ( arr[0] == 3.0 ) + elif storage_type == dace.dtypes.StorageType.GPU_Global: + try: + import cupy + except Exception: + return + + arr = cupy.array([-1.0]).astype(cupy.float32) + user_size = numpy.array([10, 10]).astype(numpy.uint64) + compiled_sdfg(user_size=user_size, example_array=arr) + assert ( arr.get()[0] == 3.0 ) + +@pytest.mark.gpu +def test_realloc_use_gpu(transient: bool): + _test_realloc_use(dace.dtypes.StorageType.GPU_Global, transient, dace.dtypes.ScheduleType.GPU_Device) + +def test_realloc_use_cpu(transient: bool): + _test_realloc_use(dace.dtypes.StorageType.CPU_Heap, transient, dace.dtypes.ScheduleType.Sequential) + +@pytest.mark.gpu +def test_realloc_use_cpu_pinned(transient: bool): + _test_realloc_use(dace.dtypes.StorageType.CPU_Pinned, transient, dace.dtypes.ScheduleType.Sequential) + +@pytest.mark.gpu +def test_trivial_realloc_gpu(transient: bool): + _test_trivial_realloc(dace.dtypes.StorageType.GPU_Global, transient) + +@pytest.mark.gpu +def test_trivial_realloc_cpu_pinned(transient: bool): + _test_trivial_realloc(dace.dtypes.StorageType.CPU_Pinned, transient) + +def test_trivial_realloc_cpu(transient: bool): + _test_trivial_realloc(dace.dtypes.StorageType.CPU_Heap, transient) + + +def _add_realloc_inside_map(sdfg: dace.SDFG, schedule_type: dace.dtypes.ScheduleType): + pre_state = sdfg.states()[0] + state = sdfg.add_state("s2") + sdfg.add_edge(pre_state, state, dace.InterstateEdge(None, None)) + + map_entry, map_exit = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,4,1)])}, + schedule=schedule_type) + an_2 = state.add_access('A') + an_2.add_in_connector("_write_size") + + t1 = state.add_tasklet(name="assign", inputs={}, outputs={"__out"}, code="_out=8") + t1.add_out_connector("__out") + + _, _ = sdfg.add_array("tmp0", shape=(2, ), dtype=numpy.uint64, transient=True) + sca = state.add_access("tmp0") + + state.add_edge(map_entry, None, t1, None, dace.Memlet(None)) + state.add_edge(t1, "__out", sca, None, dace.Memlet("tmp0[0]")) + state.add_edge(sca, None, an_2, "_write_size", dace.Memlet("tmp0")) + state.add_edge(an_2, None, map_exit, None, dace.Memlet(None)) + +def test_realloc_inside_map_gpu(): + sdfg =_get_assign_map_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device) + _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.GPU_Device) + try: + sdfg.validate() + except Exception: + return + + pytest.fail("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.") + +def test_realloc_inside_map_cpu_pinned(): + sdfg =_get_assign_map_sdfg(dace.dtypes.StorageType.CPU_Pinned, True, dace.dtypes.ScheduleType.Sequential) + _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.Sequential) + try: + sdfg.validate() + except Exception: + return + + pytest.fail("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.") + + +def test_realloc_inside_map_cpu(): + sdfg =_get_assign_map_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore) + _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.CPU_Multicore) + try: + sdfg.validate() + except Exception: + return + + pytest.fail("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.") + +def _get_conditional_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType, defer_expr_instead_of_symbol: bool = False): + sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_2") + + if not defer_expr_instead_of_symbol: + sdfg.add_array(name="A", shape=("__dace_defer", "__dace_defer"), dtype=dace.float32, storage=storage_type, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient) + else: + sdfg.add_array(name="A", shape=("4 * __dace_defer", "8 * __dace_defer"), dtype=dace.float32, storage=storage_type, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient) + + sdfg.add_scalar(name="path", transient=False, dtype=numpy.uint64) + + start = sdfg.add_state("s1") + iftrue = sdfg.add_state("s1_0") + iffalse = sdfg.add_state("s1_1") + assigntrue = sdfg.add_state("s2_0") + assignfalse = sdfg.add_state("s2_1") + state = sdfg.add_state("s3") + + sdfg.add_edge(start, iftrue, dace.InterstateEdge("path == 1")) + sdfg.add_edge(start, iffalse, dace.InterstateEdge("path != 1")) + sdfg.add_edge(iftrue, assigntrue, dace.InterstateEdge(None)) + sdfg.add_edge(iffalse, assignfalse, dace.InterstateEdge(None)) + sdfg.add_edge(assigntrue, state, dace.InterstateEdge(None)) + sdfg.add_edge(assignfalse, state, dace.InterstateEdge(None)) + + s1name, s1 = sdfg.add_array(name="size1", shape=(2,), dtype=numpy.uint64, storage=dace.dtypes.StorageType.Register, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=False) + s2name, s2 = sdfg.add_array(name="size2", shape=(2,), dtype=numpy.uint64, storage=dace.dtypes.StorageType.Register, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=False) + + an_2_0 = assigntrue.add_access('A') + an_2_0.add_in_connector('_write_size') + an_u_2_0 = assigntrue.add_access("size1") + assigntrue.add_edge(an_u_2_0, None, an_2_0, "_write_size", dace.memlet.Memlet("size1")) + + an_2_1 = assignfalse.add_access('A') + an_2_1.add_in_connector('_write_size') + an_u_2_1 = assignfalse.add_access("size2") + assignfalse.add_edge(an_u_2_1, None, an_2_1, "_write_size", dace.memlet.Memlet("size2")) + + if storage_type == dace.dtypes.StorageType.CPU_Heap: + assert (schedule_type == dace.dtypes.ScheduleType.Sequential or schedule_type == dace.dtypes.ScheduleType.CPU_Multicore) + elif storage_type == dace.dtypes.StorageType.GPU_Global: + assert (schedule_type == dace.dtypes.ScheduleType.GPU_Device) + elif storage_type == dace.dtypes.StorageType.CPU_Pinned: + assert (schedule_type == dace.dtypes.ScheduleType.Sequential) + + an_3 = state.add_access('A') + an_3.add_out_connector('_read_size') + map_entry, map_exit = state.add_map(name="map",ndrange={"i":dace.subsets.Range([(0,"__A_0-1",1)]), + "j":dace.subsets.Range([(0,"__A_1-1", 1)])}, + schedule=schedule_type) + state.add_edge(an_3, '_read_size', map_entry, "__A_0", dace.Memlet(expr="A_size[0]")) + state.add_edge(an_3, '_read_size', map_entry, "__A_1", dace.Memlet(expr="A_size[1]")) + map_entry.add_in_connector("__A_0") + map_entry.add_in_connector("__A_1") + map_exit.add_in_connector("IN_A") + map_exit.add_out_connector("OUT_A") + + t1 = state.add_tasklet(name="assign", inputs={}, outputs={"_out"}, code="_out=3.0") + state.add_edge(map_entry, None, t1, None, dace.Memlet(None)) + state.add_edge(t1, "_out", map_exit, "IN_A", dace.Memlet(expr="A[i, j]")) + + an_4 = state.add_access('A') + state.add_edge(map_exit, "OUT_A", an_4, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,"__A_0-1", 1), (0,"__A_1-1", 1)]))) + + an_4.add_out_connector('_read_size') + map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,"__A_0-1",1)]),"j":dace.subsets.Range([(0,"__A_1-1", 1)])}, + schedule=schedule_type) + state.add_edge(an_4, '_read_size', map_entry2, "__A_0", dace.Memlet(expr="A_size[0]")) + state.add_edge(an_4, '_read_size', map_entry2, "__A_1", dace.Memlet(expr="A_size[1]")) + state.add_edge(an_4, None, map_entry2, "IN_A", dace.Memlet(expr="A[0:__A_0, 0:__A_1]")) + map_entry2.add_in_connector("__A_0") + map_entry2.add_in_connector("__A_1") + map_entry2.add_in_connector("IN_A") + map_entry2.add_out_connector("OUT_A") + map_exit2.add_in_connector("IN_A") + map_exit2.add_out_connector("OUT_A") + + t2 = state.add_tasklet(name="check", inputs={"_in"}, outputs={"_out"}, code='_out = _in', language=dace.dtypes.Language.Python) + state.add_edge(map_entry2, "OUT_A", t2, "_in", dace.Memlet(expr="A[i, j]")) + state.add_edge(t2, "_out", map_exit2, "IN_A", dace.Memlet(expr="A[i, j]")) + + an_5 = state.add_access('A') + state.add_edge(map_exit2, "OUT_A", an_5, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,"__A_0-1", 1), (0,"__A_1-1", 1)]))) + + arr_name, arr = sdfg.add_array(name="example_array", dtype=dace.float32, shape=(1,), transient=False, storage=storage_type) + arrn = state.add_access(arr_name) + state.add_edge(an_5, None, arrn, None, dace.memlet.Memlet("A[0, 0]")) + + return sdfg + +@pytest.mark.gpu +def test_conditional_alloc_gpu(): + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + try: + import cupy + except Exception: + return + + arr = cupy.array([-1.0]).astype(cupy.float32) + sdfg(path=1, size1=size1, size2=size2, example_array=arr) + assert ( arr.get()[0] == 3.0 ) + +@pytest.mark.gpu +def test_conditional_alloc_cpu_pinned(): + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Pinned, True, dace.dtypes.ScheduleType.Sequential) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + arr = numpy.array([-1.0]).astype(numpy.float32) + sdfg(path=1, size1=size1, size2=size2, example_array=arr) + assert ( arr[0] == 3.0 ) + +def test_conditional_alloc_cpu(): + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + arr = numpy.array([-1.0]).astype(numpy.float32) + sdfg(path=0, size1=size1, size2=size2, example_array=arr) + assert ( arr[0] == 3.0 ) + +@pytest.mark.gpu +def test_conditional_alloc_with_expr_gpu(): + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device, True) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + try: + import cupy + except Exception: + return + + arr = cupy.array([-1.0]).astype(cupy.float32) + sdfg(path=1, size1=size1, size2=size2, example_array=arr) + assert ( arr.get()[0] == 3.0 ) + +@pytest.mark.gpu +def test_conditional_alloc_with_expr_cpu_pinned(): + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Pinned, True, dace.dtypes.ScheduleType.Sequential, True) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + arr = numpy.array([-1.0]).astype(numpy.float32) + sdfg(path=1, size1=size1, size2=size2, example_array=arr) + assert ( arr[0] == 3.0 ) + +def test_conditional_alloc_with_expr_cpu(): + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore, True) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + arr = numpy.array([-1.0]).astype(numpy.float32) + sdfg(path=0, size1=size1, size2=size2, example_array=arr) + assert ( arr[0] == 3.0 ) + +def test_incomplete_write_dimensions_1(): + sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2") + try: + sdfg.validate() + except Exception: + return + + pytest.fail("Realloc-use with transient data and incomplete write did not fail when it was expected to.") + +def test_incomplete_write_dimensions_2(): + sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, False, "1:2") + try: + sdfg.validate() + except Exception: + return + + pytest.fail("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.") + + +if __name__ == "__main__": + print(f"Trivial Realloc within map, cpu") + test_realloc_inside_map_cpu() + print(f"Trivial Realloc within map, gpu") + test_realloc_inside_map_gpu() + print(f"Trivial Realloc within map, cpu pinned") + test_realloc_inside_map_cpu_pinned() + + print(f"Trivial Realloc with storage, cpu") + test_trivial_realloc_cpu(True) + print(f"Trivial Realloc with storage, gpu") + test_trivial_realloc_gpu(True) + print(f"Trivial Realloc with storage, cpu pinned") + test_trivial_realloc_cpu_pinned(True) + + print(f"Trivial Realloc with storage, cpu, on non-transient data") + test_trivial_realloc_cpu(False) + print(f"Trivial Realloc-Use with storage, gpu, on non-transient data") + test_trivial_realloc_gpu(False) + print(f"Trivial Realloc with storage, cpu pinned, on non-transient data") + test_trivial_realloc_cpu_pinned(False) + + print(f"Trivial Realloc-Use with storage, cpu, transient") + test_realloc_use_cpu(True) + print(f"Trivial Realloc-Use with storage, gpu, transient") + test_realloc_use_gpu(True) + print(f"Trivial Realloc-Use with storage, cpu pinned, transient") + test_realloc_use_cpu_pinned(True) + + print(f"Trivial Realloc-Use with storage, cpu, non-transient") + test_realloc_use_cpu(False) + print(f"Trivial Realloc-Use with storage, gpu, non-transient") + test_realloc_use_gpu(False) + print(f"Trivial Realloc-Use with storage, cpu pinned, non-transient") + test_realloc_use_cpu_pinned(False) + + print(f"Realloc with incomplete write one, validation") + test_incomplete_write_dimensions_1() + print(f"Realloc with incomplete write two, validation") + test_incomplete_write_dimensions_2() + + print(f"Test conditional alloc with use, cpu") + test_conditional_alloc_cpu() + print(f"Test conditional alloc with use, gpu") + test_conditional_alloc_gpu() + print(f"Test conditional alloc with use, cpu pinned") + test_conditional_alloc_cpu_pinned() + + print(f"Test conditional alloc with use and the shape as a non-trivial expression, cpu") + test_conditional_alloc_with_expr_cpu() + print(f"Test conditional alloc with use and the shape as a non-trivial expression, gpu") + test_conditional_alloc_with_expr_gpu() + print(f"Test conditional alloc with use and the shape as a non-trivial expression, cpu pinned") + test_conditional_alloc_with_expr_cpu_pinned() diff --git a/tests/passes/array_elimination_test.py b/tests/passes/array_elimination_test.py index f20428fda3..13d70cdead 100644 --- a/tests/passes/array_elimination_test.py +++ b/tests/passes/array_elimination_test.py @@ -30,7 +30,7 @@ def tester(A: dace.float64[20], B: dace.float64[20]): sdutil.inline_sdfgs(sdfg) sdutil.fuse_states(sdfg) Pipeline([ArrayElimination()]).apply_pass(sdfg, {}) - assert len(sdfg.arrays) == 4 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 4 def test_merge_simple(): diff --git a/tests/passes/lift_struct_views_test.py b/tests/passes/lift_struct_views_test.py index 71f19215b5..81a05d5f5c 100644 --- a/tests/passes/lift_struct_views_test.py +++ b/tests/passes/lift_struct_views_test.py @@ -25,14 +25,14 @@ def test_simple_tasklet_access(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 res = LiftStructViews().apply_pass(sdfg, {}) assert len(res['A']) == 1 assert len(res['Z']) == 1 assert len(state.nodes()) == 5 - assert len(sdfg.arrays) == 4 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 4 assert sdfg.is_valid() @@ -55,14 +55,14 @@ def test_sliced_tasklet_access(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 res = LiftStructViews().apply_pass(sdfg, {}) assert len(res['A']) == 1 assert len(res['Z']) == 1 assert len(state.nodes()) == 5 - assert len(sdfg.arrays) == 4 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 4 assert sdfg.is_valid() @@ -91,12 +91,12 @@ def test_sliced_multi_tasklet_access(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 FixedPointPipeline([LiftStructViews()]).apply_pass(sdfg, {}) assert len(state.nodes()) == 9 - assert len(sdfg.arrays) == 8 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 8 assert sdfg.is_valid() @@ -121,12 +121,12 @@ def test_tasklet_access_to_cont_array(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z[0]')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 FixedPointPipeline([LiftStructViews()]).apply_pass(sdfg, {}) assert len(state.nodes()) == 7 - assert len(sdfg.arrays) == 6 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 6 assert sdfg.is_valid() @@ -157,12 +157,12 @@ def test_sliced_multi_tasklet_access_to_cont_array(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z[0]')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 FixedPointPipeline([LiftStructViews()]).apply_pass(sdfg, {}) assert len(state.nodes()) == 11 - assert len(sdfg.arrays) == 10 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 10 assert sdfg.is_valid() diff --git a/tests/passes/scalar_fission_test.py b/tests/passes/scalar_fission_test.py index f8c59b8f4d..74d64555d5 100644 --- a/tests/passes/scalar_fission_test.py +++ b/tests/passes/scalar_fission_test.py @@ -108,7 +108,7 @@ def test_scalar_fission(with_raising): # Both interstate edges should be different now. assert tmp1_edge.assignments != tmp2_edge.assignments # There should now be 5 arrays in the SDFG, i.e. 2 more than before since two isolated scopes of tmp exist. - assert len(sdfg.arrays.keys()) == 5 + assert len(sdfg.arrays.keys()) - len(sdfg.size_arrays()) == 5 # Assert all accesses per scope are identical. assert all([n.data == list(tmp1_edge.assignments.values())[0] for n in [tmp1_write, loop1_read_tmp]]) assert all([n.data == list(tmp2_edge.assignments.values())[0] for n in [tmp2_write, loop2_read_tmp]]) @@ -197,7 +197,7 @@ def test_branch_subscopes_nofission(with_raising): Pipeline([ScalarFission()]).apply_pass(sdfg, {}) - assert set(sdfg.arrays.keys()) == {'A', 'B', 'C'} + assert set(sdfg.arrays.keys()).difference(sdfg.size_arrays()) == {'A', 'B', 'C'} @pytest.mark.parametrize('with_raising', (False, True)) def test_branch_subscopes_fission(with_raising): @@ -293,7 +293,7 @@ def test_branch_subscopes_fission(with_raising): Pipeline([ScalarFission()]).apply_pass(sdfg, {}) - assert set(sdfg.arrays.keys()) == {'A', 'B', 'C', 'B_0', 'B_1'} + assert set(sdfg.arrays.keys()).difference(set(sdfg.size_arrays())) == {'A', 'B', 'C', 'B_0', 'B_1'} if __name__ == '__main__': test_scalar_fission(False) diff --git a/tests/sdfg/cutout_test.py b/tests/sdfg/cutout_test.py index 151c3cab47..0049e460b5 100644 --- a/tests/sdfg/cutout_test.py +++ b/tests/sdfg/cutout_test.py @@ -21,7 +21,7 @@ def simple_matmul(A: dace.float64[20, 20], B: dace.float64[20, 20]): cut_sdfg = SDFGCutout.singlestate_cutout(state, node) assert cut_sdfg.number_of_nodes() == 1 assert cut_sdfg.node(0).number_of_nodes() == 4 - assert len(cut_sdfg.arrays) == 3 + assert len(set(cut_sdfg.arrays.keys()).difference(set(cut_sdfg.size_arrays()))) == 3 assert all(not a.transient for a in cut_sdfg.arrays.values()) @@ -42,7 +42,7 @@ def simple_matmul(A: dace.float64[20, 20], B: dace.float64[20, 20]): cut_sdfg = SDFGCutout.singlestate_cutout(state, *nodes) assert cut_sdfg.number_of_nodes() == 1 assert cut_sdfg.node(0).number_of_nodes() == 7 - assert len(cut_sdfg.arrays) == 5 + assert len(set(cut_sdfg.arrays.keys()).difference(set(cut_sdfg.size_arrays()))) == 5 assert (not any(a.transient for a in cut_sdfg.arrays.values())) @@ -309,7 +309,7 @@ def test_input_output_configuration(): assert ct.arrays['tmp2'].transient == False assert ct.arrays['tmp3'].transient == True assert ct.arrays['tmp4'].transient == True - assert len(ct.arrays) == 4 + assert len(set(ct.arrays.keys()).difference(set(ct.size_arrays()))) == 4 def test_minimum_cut_simple_no_further_input_config(): diff --git a/tests/transformations/redundant_copy_test.py b/tests/transformations/redundant_copy_test.py index 280d5f182a..583b76b8bf 100644 --- a/tests/transformations/redundant_copy_test.py +++ b/tests/transformations/redundant_copy_test.py @@ -74,7 +74,7 @@ def make_sdfg() -> Tuple[dace.SDFG, dace.nodes.AccessNode, dace.nodes.AccessNode ) sdfg.validate() assert state.number_of_nodes() == 4 - assert len(sdfg.arrays) == 4 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 4 return sdfg, a_an, b_an, output_an def apply_trafo( @@ -89,7 +89,7 @@ def apply_trafo( candidate = {type(trafo).in_array: in_array, type(trafo).out_array: out_array} state = sdfg.start_block state_id = sdfg.node_id(state) - initial_arrays = len(sdfg.arrays) + initial_arrays = len(sdfg.arrays) - len(sdfg.size_arrays()) initial_access_nodes = state.number_of_nodes() trafo.setup_match(sdfg, sdfg.cfg_id, state_id, candidate, 0, override=True) @@ -101,7 +101,7 @@ def apply_trafo( assert False, f"A view was created instead removing '{in_array.data}'." sdfg.validate() assert state.number_of_nodes() == initial_access_nodes - 1 - assert len(sdfg.arrays) == initial_arrays - 1 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == initial_arrays - 1 assert in_array.data not in sdfg.arrays return sdfg