From b10475ff97d8cf218c7e06f73b5d5a2627e04e1f Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 23 Oct 2024 14:36:54 +0200 Subject: [PATCH 01/51] Early changes to support reallocation for CPU_Heap storage --- dace/codegen/dispatcher.py | 40 +++++++++++++++++-- dace/codegen/targets/cpu.py | 76 ++++++++++++++++++++++++++----------- 2 files changed, 91 insertions(+), 25 deletions(-) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 9bec33b4ef..b05564bbe4 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -20,7 +20,7 @@ @registry.extensible_enum class DefinedType(aenum.AutoNumberEnum): """ Data types for `DefinedMemlets`. - + :see: DefinedMemlets """ Pointer = () # Pointer @@ -159,6 +159,8 @@ class TargetDispatcher(object): _state_dispatchers: List[Tuple[Callable, target.TargetCodeGenerator]] _generic_state_dispatcher: Optional[target.TargetCodeGenerator] + _generic_reallocate_dispatcher: Dict[dtypes.StorageType, target.TargetCodeGenerator] + _declared_arrays: DefinedMemlets _defined_vars: DefinedMemlets @@ -181,6 +183,7 @@ def __init__(self, framecode): self._node_dispatchers = [] self._generic_node_dispatcher = None self._state_dispatchers = [] + self._generic_reallocate_dispatchers = {} self._generic_state_dispatcher = None self._declared_arrays = DefinedMemlets() @@ -189,7 +192,7 @@ def __init__(self, framecode): @property def declared_arrays(self) -> DefinedMemlets: """ Returns a list of declared variables. - + This is used for variables that must have their declaration and allocation separate. It includes all such variables that have been declared by the dispatcher. @@ -199,7 +202,7 @@ def declared_arrays(self) -> DefinedMemlets: @property def defined_vars(self) -> DefinedMemlets: """ Returns a list of defined variables. - + This includes all variables defined by the dispatcher. 
""" return self._defined_vars @@ -354,6 +357,16 @@ def register_copy_dispatcher(self, src_storage: dtypes.StorageType, dst_storage: self._copy_dispatchers[dispatcher].append((predicate, func)) + def register_reallocate_dispatcher(self, node_storage: dtypes.StorageType, + func: target.TargetCodeGenerator, + predicate: Optional[Callable] = None) -> None: + + if not isinstance(node_storage, dtypes.StorageType): raise TypeError(node_storage, dtypes.StorageType, isinstance(node_storage, dtypes.StorageType)) + + dispatcher = (node_storage) + self._generic_copy_dispatchers[dispatcher] = func + return + def get_state_dispatcher(self, sdfg: SDFG, state: SDFGState) -> target.TargetCodeGenerator: # Check if the state satisfies any predicates that delegate to a # specific code generator @@ -594,6 +607,14 @@ def get_copy_dispatcher(self, src_node: Union[nodes.CodeNode, nodes.AccessNode], return target + def get_reallocate_dispatcher(self, node: Union[nodes.CodeNode, nodes.AccessNode], + edge: MultiConnectorEdge[Memlet], + sdfg: SDFG, state: SDFGState) -> Optional[target.TargetCodeGenerator]: + node_storage = sdfg.arrays[node.data].storage + target = self._generic_reallocate_dispatchers[node_storage] + return target + + def dispatch_copy(self, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, function_stream: CodeIOStream, output_stream: CodeIOStream) -> None: @@ -609,6 +630,19 @@ def dispatch_copy(self, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiC self._used_targets.add(target) target.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, function_stream, output_stream) + def dispatch_reallocate(self, node: nodes.Node, edge: MultiConnectorEdge[Memlet], sdfg: SDFG, + cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, function_stream: CodeIOStream, + output_stream: CodeIOStream) -> None: + state = cfg.state(state_id) + target = self.get_reallocate_dispatcher(node, edge, sdfg, state) + if target is None: + return + + # Dispatch copy + self._used_targets.add(target) + target.reallocate(sdfg, cfg, dfg, state_id, node, edge, function_stream, output_stream) + + # Dispatches definition code for a memlet that is outgoing from a tasklet def dispatch_output_definition(self, src_node: nodes.Node, dst_node: nodes.Node, edge, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 51daaa432b..1269e35134 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -122,6 +122,8 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): for src_storage, dst_storage in itertools.product(cpu_storage, cpu_storage): dispatcher.register_copy_dispatcher(src_storage, dst_storage, None, self) + dispatcher.register_reallocate_dispatcher(dtypes.StorageType.CPU_Heap, None, self) + @staticmethod def cmake_options(): options = [] @@ -399,6 +401,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if isinstance(nodedesc, data.Structure) and not isinstance(nodedesc, data.StructureView): declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type};\n") + declaration_stream.write(f"// C") define_var(name, DefinedType.Pointer, nodedesc.ctype) if allocate_nested_data: for k, v in nodedesc.members.items(): @@ -493,9 +496,26 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if not declared: 
declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node) - allocation_stream.write( - "%s = new %s DACE_ALIGN(64)[%s];\n" % (alloc_name, nodedesc.dtype.ctype, cpp.sym2cpp(arrsize)), cfg, - state_id, node) + if isinstance(arrsize, symbolic.symbol) and str(arrsize) == "__dace_defer": + allocation_stream.write("// Deferred Allocation") + allocation_stream.write( + "%s = nullptr;" % + (alloc_name,), + cfg, + state_id, + node + ) + else: + allocation_stream.write( + "%s = new %s DACE_ALIGN(64)[%s];\n" % + (alloc_name, + nodedesc.dtype.ctype, + cpp.sym2cpp(arrsize)), + cfg, + state_id, + node + ) + define_var(name, DefinedType.Pointer, ctypedef) if node.setzero: @@ -2155,28 +2175,40 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub sdict = state_dfg.scope_dict() for edge in state_dfg.in_edges(node): - predecessor, _, _, _, memlet = edge + predecessor, _, dst, in_connector, memlet = edge if memlet.data is None: continue # If the edge has to be skipped - # Determines if this path ends here or has a definite source (array) node - memlet_path = state_dfg.memlet_path(edge) - if memlet_path[-1].dst == node: - src_node = memlet_path[0].src - # Only generate code in case this is the innermost scope - # (copies are generated at the inner scope, where both arrays exist) - if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]): - self._dispatcher.dispatch_copy( - src_node, - node, - edge, - sdfg, - cfg, - dfg, - state_id, - function_stream, - callsite_stream, - ) + if in_connector == "IN_size": + self._dispatcher.dispatch_reallocate( + node, + edge, + sdfg, + cfg, + dfg, + state_id, + function_stream, + callsite_stream, + ) + else: + # Determines if this path ends here or has a definite source (array) node + memlet_path = state_dfg.memlet_path(edge) + if memlet_path[-1].dst == node: + src_node = memlet_path[0].src + # Only generate code in case this is the innermost scope + # (copies are generated at the inner scope, where both arrays exist) + if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]): + self._dispatcher.dispatch_copy( + src_node, + node, + edge, + sdfg, + cfg, + dfg, + state_id, + function_stream, + callsite_stream, + ) # Process outgoing memlets (array-to-array write should be emitted # from the first leading edge out of the array) From fae07041c0225f04d9261c3d9c4d5fcbead2acc7 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 23 Oct 2024 15:03:12 +0200 Subject: [PATCH 02/51] Minimal functioning realloc --- dace/codegen/dispatcher.py | 10 +++++----- dace/codegen/targets/cpu.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index b05564bbe4..59f472d57f 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -159,7 +159,7 @@ class TargetDispatcher(object): _state_dispatchers: List[Tuple[Callable, target.TargetCodeGenerator]] _generic_state_dispatcher: Optional[target.TargetCodeGenerator] - _generic_reallocate_dispatcher: Dict[dtypes.StorageType, target.TargetCodeGenerator] + _generic_reallocate_dispatchers: Dict[dtypes.StorageType, target.TargetCodeGenerator] _declared_arrays: DefinedMemlets _defined_vars: DefinedMemlets @@ -362,9 +362,8 @@ def register_reallocate_dispatcher(self, node_storage: dtypes.StorageType, predicate: Optional[Callable] = None) -> None: if not isinstance(node_storage, dtypes.StorageType): raise TypeError(node_storage, 
dtypes.StorageType, isinstance(node_storage, dtypes.StorageType)) - - dispatcher = (node_storage) - self._generic_copy_dispatchers[dispatcher] = func + dispatcher = node_storage + self._generic_reallocate_dispatchers[dispatcher] = func return def get_state_dispatcher(self, sdfg: SDFG, state: SDFGState) -> target.TargetCodeGenerator: @@ -635,10 +634,11 @@ def dispatch_reallocate(self, node: nodes.Node, edge: MultiConnectorEdge[Memlet] output_stream: CodeIOStream) -> None: state = cfg.state(state_id) target = self.get_reallocate_dispatcher(node, edge, sdfg, state) + assert target is not None if target is None: return - # Dispatch copy + # Dispatch reallocate self._used_targets.add(target) target.reallocate(sdfg, cfg, dfg, state_id, node, edge, function_stream, output_stream) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 1269e35134..eb33c60abc 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -122,7 +122,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): for src_storage, dst_storage in itertools.product(cpu_storage, cpu_storage): dispatcher.register_copy_dispatcher(src_storage, dst_storage, None, self) - dispatcher.register_reallocate_dispatcher(dtypes.StorageType.CPU_Heap, None, self) + dispatcher.register_reallocate_dispatcher(dtypes.StorageType.CPU_Heap, self) @staticmethod def cmake_options(): @@ -666,6 +666,24 @@ def copy_memory( callsite_stream, ) + def reallocate( + self, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg: StateSubgraphView, + state_id: int, + node: Union[nodes.Tasklet, nodes.AccessNode], + edge: Tuple[nodes.Node, Optional[str], nodes.Node, Optional[str], mmlt.Memlet], + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + ): + callsite_stream.write( + f"// Reallocate Called" + ) + dtype = sdfg.arrays[node.data].dtype + callsite_stream.write( + f"{node.data} = realloc({node.data}, {edge.data.data} * sizeof({dtype}));" + ) def _emit_copy( self, @@ -1122,7 +1140,13 @@ def process_out_memlets(self, # Dispatch array-to-array outgoing copies here elif isinstance(node, nodes.AccessNode): - if dst_node != node and not isinstance(dst_node, nodes.Tasklet): + if dst_node != node and not isinstance(dst_node, nodes.Tasklet) : + # If it is a size change, reallocate will be called + if edge.dst_conn.endswith("_size"): + result.write("// No Copy as AN -> AN write is to ") + continue + + result.write("// COPY2") dispatcher.dispatch_copy( node, dst_node, @@ -1435,6 +1459,7 @@ def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgra self._dispatcher.defined_vars.add(edge.dst_conn, defined_type, f"const {ctype}") else: + inner_stream.write("// COPY3") self._dispatcher.dispatch_copy( src_node, node, @@ -2198,6 +2223,7 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub # Only generate code in case this is the innermost scope # (copies are generated at the inner scope, where both arrays exist) if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]): + callsite_stream.write("// COPY1") self._dispatcher.dispatch_copy( src_node, node, From 023c86c509436c7abe88e9095a6bf6e9148b2e63 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Thu, 24 Oct 2024 11:07:41 +0200 Subject: [PATCH 03/51] Add first prototype of deferred allocation support --- dace/codegen/targets/cpu.py | 24 +++++++++---------- dace/sdfg/sdfg.py | 47 +++++++++++++++++++++++++++---------- 2 files changed, 47 insertions(+), 24 deletions(-) diff --git 
a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index eb33c60abc..19884fb88d 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -395,13 +395,13 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
 
         # Compute array size
         arrsize = nodedesc.total_size
+        deferred_allocation = any([s for s in nodedesc.shape if str(s).startswith("__dace_defer")])
         arrsize_bytes = None
         if not isinstance(nodedesc.dtype, dtypes.opaque):
             arrsize_bytes = arrsize * nodedesc.dtype.bytes
 
         if isinstance(nodedesc, data.Structure) and not isinstance(nodedesc, data.StructureView):
             declaration_stream.write(f"{nodedesc.ctype} {name} = new {nodedesc.dtype.base_type};\n")
-            declaration_stream.write(f"// C")
             define_var(name, DefinedType.Pointer, nodedesc.ctype)
             if allocate_nested_data:
                 for k, v in nodedesc.members.items():
@@ -496,10 +496,9 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
         if not declared:
             declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node)
 
-        if isinstance(arrsize, symbolic.symbol) and str(arrsize) == "__dace_defer":
-            allocation_stream.write("// Deferred Allocation")
+        if deferred_allocation:
             allocation_stream.write(
-                "%s = nullptr;" %
+                "%s = nullptr; // Deferred Allocation" %
                 (alloc_name,),
                 cfg,
                 state_id,
@@ -677,12 +676,16 @@ def reallocate(
         function_stream: CodeIOStream,
         callsite_stream: CodeIOStream,
     ):
-        callsite_stream.write(
-            f"// Reallocate Called"
-        )
-        dtype = sdfg.arrays[node.data].dtype
+        function_stream.write(
+            "#include <cstdlib>"
+        )
+        data_name = node.data
+        size_array_name = f"{data_name}_size"
+        data = sdfg.arrays[data_name]
+        dtype = sdfg.arrays[data_name].dtype
+        size_str = " * ".join([f"{size_array_name}[{i}]" for i in range(len(data.shape))])
         callsite_stream.write(
-            f"{node.data} = realloc({node.data}, {edge.data.data} * sizeof({dtype}));"
+            f"{node.data} = static_cast<{dtype} *>(std::realloc(static_cast<void *>({node.data}), {size_str} * sizeof({dtype})));"
         )
 
@@ -1142,11 +1145,9 @@ def process_out_memlets(self,
         # Dispatch array-to-array outgoing copies here
         elif isinstance(node, nodes.AccessNode):
             if dst_node != node and not isinstance(dst_node, nodes.Tasklet) :
                 # If it is a size change, reallocate will be called
-                if edge.dst_conn.endswith("_size"):
-                    result.write("// No Copy as AN -> AN write is to ")
+                if edge.dst_conn is not None and edge.dst_conn.endswith("_size"):
                     continue
 
-                result.write("// COPY2")
                 dispatcher.dispatch_copy(
                     node,
                     dst_node,
@@ -2223,7 +2224,6 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
                     # Only generate code in case this is the innermost scope
                     # (copies are generated at the inner scope, where both arrays exist)
                     if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]):
-                        callsite_stream.write("// COPY1")
                         self._dispatcher.dispatch_copy(
                             src_node,
                             node,
diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py
index 38a41236a6..f91e62df97 100644
--- a/dace/sdfg/sdfg.py
+++ b/dace/sdfg/sdfg.py
@@ -1032,7 +1032,7 @@ def clear_data_reports(self):
 
     def call_with_instrumented_data(self, dreport: 'InstrumentedDataReport', *args, **kwargs):
         """
-        Invokes an SDFG with an instrumented data report, generating and compiling code if necessary. 
+        Invokes an SDFG with an instrumented data report, generating and compiling code if necessary.
         Arguments given as ``args`` and ``kwargs`` will be overridden by the data containers defined in the report.
 
         :param dreport: The instrumented data report to use upon calling.
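The hunk below is the user-facing entry point for deferred allocation: a shape entry given as the string "__dace_defer" is rewritten to a placeholder symbol "__dace_defer_dim<i>", and a companion one-dimensional size array "<name>_size" is registered alongside the data descriptor. A minimal usage sketch, assuming the standard DaCe Python API (the SDFG and array names are illustrative):

    import dace

    sdfg = dace.SDFG('deferred_example')
    # '__dace_defer' marks a dimension whose extent is only known at runtime;
    # add_array turns it into the symbol '__dace_defer_dim0' and, after this
    # patch, also registers a transient uint64 size array 'A_size' holding one
    # entry per dimension.
    sdfg.add_array('A', shape=('__dace_defer',), dtype=dace.float64,
                   transient=True, storage=dace.StorageType.CPU_Heap)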
@@ -1694,21 +1694,28 @@ def add_array(self,
                   may_alias=False) -> Tuple[str, dt.Array]:
         """ Adds an array to the SDFG data descriptor store. """
 
-        # convert strings to int if possible
+        # Every array also supports reallocation, so we create a secondary size array.
+        # The size array's shape is fixed, but the values stored in it can change.
+
+        # convert strings to int if possible, unless it is the reserved symbol for deferred allocation
         newshape = []
-        for s in shape:
-            try:
-                newshape.append(int(s))
-            except:
-                newshape.append(dace.symbolic.pystr_to_symbolic(s))
+        for i, s in enumerate(shape):
+            if isinstance(s, str) and s == "__dace_defer":
+                newshape.append(dace.symbolic.pystr_to_symbolic(f"{s}_dim{i}"))
+            else:
+                try:
+                    newshape.append(int(s))
+                except:
+                    newshape.append(dace.symbolic.pystr_to_symbolic(s))
         shape = newshape
 
         strides = strides or None
 
         if isinstance(dtype, type) and dtype in dtypes._CONSTANT_TYPES[:-1]:
             dtype = dtypes.typeclass(dtype)
 
-        desc = dt.Array(dtype,
-                        shape,
+        desc = dt.Array(dtype=dtype,
+                        shape=shape,
                         storage=storage,
                         location=location,
                         allow_conflicts=allow_conflicts,
@@ -1721,7 +1728,23 @@ def add_array(self,
                         total_size=total_size,
                         may_alias=may_alias)
 
-        return self.add_datadesc(name, desc, find_new_name=find_new_name), desc
+        size_desc = dt.Array(dtype=dace.uint64,
+                             shape=(len(shape),),
+                             storage=storage,
+                             location=location,
+                             allow_conflicts=False,
+                             transient=True,
+                             strides=(1,),
+                             offset=(0,),
+                             lifetime=lifetime,
+                             alignment=alignment,
+                             debuginfo=debuginfo,
+                             total_size=len(shape),
+                             may_alias=False)
+
+        array_name = self.add_datadesc(name, desc, find_new_name=find_new_name)
+        self.add_datadesc(f"{array_name}_size", size_desc, find_new_name=False)
+        return array_name, desc
 
     def add_view(self,
                  name: str,
@@ -2602,7 +2625,7 @@ def apply_transformations_once_everywhere(self,
                                               print_report: Optional[bool] = None,
                                               order_by_transformation: bool = True,
                                               progress: Optional[bool] = None) -> int:
-        """ 
+        """
         This function applies a transformation or a set of (unique) transformations
         throughout the entire SDFG once. Operates in-place.
 
@@ -2718,7 +2741,7 @@ def expand_library_nodes(self, recursive=True):
 
     def generate_code(self):
         """ Generates code from this SDFG and returns it.
- 
+
         :return: A list of `CodeObject` objects containing the generated code of
                  different files and languages.
""" From 4aca5ee723078a62eaf97baf5e8ff69336444b2f Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 29 Oct 2024 16:31:24 +0100 Subject: [PATCH 04/51] Add reading the size of array, add size input as a special in connector --- dace/codegen/dispatcher.py | 4 +-- dace/codegen/targets/cpu.py | 62 ++++++++++++++++++++++++------------- dace/sdfg/state.py | 15 ++++----- dace/sdfg/validation.py | 30 ++++++++++++------ 4 files changed, 72 insertions(+), 39 deletions(-) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 59f472d57f..2defa04680 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -629,7 +629,7 @@ def dispatch_copy(self, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiC self._used_targets.add(target) target.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, function_stream, output_stream) - def dispatch_reallocate(self, node: nodes.Node, edge: MultiConnectorEdge[Memlet], sdfg: SDFG, + def dispatch_reallocate(self, src_node: nodes.Node, node: nodes.Node, edge: MultiConnectorEdge[Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, function_stream: CodeIOStream, output_stream: CodeIOStream) -> None: state = cfg.state(state_id) @@ -640,7 +640,7 @@ def dispatch_reallocate(self, node: nodes.Node, edge: MultiConnectorEdge[Memlet] # Dispatch reallocate self._used_targets.add(target) - target.reallocate(sdfg, cfg, dfg, state_id, node, edge, function_stream, output_stream) + target.reallocate(sdfg, cfg, dfg, state_id, src_node, node, edge, function_stream, output_stream) # Dispatches definition code for a memlet that is outgoing from a tasklet diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 19884fb88d..c11b94d9cd 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -496,6 +496,10 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if not declared: declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node) + # Initialize size array + size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape]) + size_nodedesc = sdfg.arrays[f"{name}_size"] + declaration_stream.write(f'{size_nodedesc.dtype.ctype} {name}_size[{size_nodedesc.shape[0]}]{{{size_str}}};\n', cfg, state_id, node) if deferred_allocation: allocation_stream.write( "%s = nullptr; // Deferred Allocation" % @@ -515,7 +519,9 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV node ) + define_var(name, DefinedType.Pointer, ctypedef) + define_var(name + "_size", DefinedType.Pointer, size_nodedesc.dtype.ctype) if node.setzero: allocation_stream.write("memset(%s, 0, sizeof(%s)*%s);" % @@ -671,7 +677,8 @@ def reallocate( cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: Union[nodes.Tasklet, nodes.AccessNode], + src_node: nodes.AccessNode, + dst_node: nodes.AccessNode, edge: Tuple[nodes.Node, Optional[str], nodes.Node, Optional[str], mmlt.Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream, @@ -679,13 +686,26 @@ def reallocate( function_stream.write( "#include " ) - data_name = node.data + data_name = dst_node.data size_array_name = f"{data_name}_size" + new_size_array_name = src_node.data + data = sdfg.arrays[data_name] + new_size_array = sdfg.arrays[new_size_array_name] dtype = sdfg.arrays[data_name].dtype + + # Only consider the offsets with __dace_defer in original dim + mask_array = 
+        for i, mask in enumerate(mask_array):
+            if mask:
+                callsite_stream.write(
+                    f"{size_array_name}[{i}] = {new_size_array_name}[{i}];"
+                )
+
+        # Call realloc only after no __dace_defer is left in size_array ?
         size_str = " * ".join([f"{size_array_name}[{i}]" for i in range(len(data.shape))])
         callsite_stream.write(
-            f"{node.data} = static_cast<{dtype} *>(std::realloc(static_cast<void *>({node.data}), {size_str} * sizeof({dtype})));"
+            f"{dst_node.data} = static_cast<{dtype} *>(std::realloc(static_cast<void *>({dst_node.data}), {size_str} * sizeof({dtype})));"
         )
 
@@ -1145,7 +1165,7 @@ def process_out_memlets(self,
         elif isinstance(node, nodes.AccessNode):
             if dst_node != node and not isinstance(dst_node, nodes.Tasklet) :
                 # If it is a size change, reallocate will be called
-                if edge.dst_conn is not None and edge.dst_conn.endswith("_size"):
+                if edge.dst_conn is not None and edge.dst_conn == "IN_size":
                     continue
 
                 dispatcher.dispatch_copy(
@@ -1460,7 +1480,6 @@ def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgra
                 self._dispatcher.defined_vars.add(edge.dst_conn, defined_type, f"const {ctype}")
             else:
-                inner_stream.write("// COPY3")
                 self._dispatcher.dispatch_copy(
                     src_node,
                     node,
@@ -2205,22 +2224,23 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
             if memlet.data is None:
                 continue  # If the edge has to be skipped
 
-            if in_connector == "IN_size":
-                self._dispatcher.dispatch_reallocate(
-                    node,
-                    edge,
-                    sdfg,
-                    cfg,
-                    dfg,
-                    state_id,
-                    function_stream,
-                    callsite_stream,
-                )
-            else:
-                # Determines if this path ends here or has a definite source (array) node
-                memlet_path = state_dfg.memlet_path(edge)
-                if memlet_path[-1].dst == node:
-                    src_node = memlet_path[0].src
+            # Determines if this path ends here or has a definite source (array) node
+            memlet_path = state_dfg.memlet_path(edge)
+            if memlet_path[-1].dst == node:
+                src_node = memlet_path[0].src
+                if in_connector == "IN_size":
+                    self._dispatcher.dispatch_reallocate(
+                        src_node,
+                        node,
+                        edge,
+                        sdfg,
+                        cfg,
+                        dfg,
+                        state_id,
+                        function_stream,
+                        callsite_stream,
+                    )
+                else:
                     # Only generate code in case this is the innermost scope
                     # (copies are generated at the inner scope, where both arrays exist)
                     if (scope_contains_scope(sdict, src_node, node) and sdict[src_node] != sdict[node]):
                         self._dispatcher.dispatch_copy(
                             src_node,
                             node,
diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py
index 2ae6109b31..ebb96d9735 100644
--- a/dace/sdfg/state.py
+++ b/dace/sdfg/state.py
@@ -243,7 +243,7 @@ def read_and_write_sets(self) -> Tuple[Set[AnyStr], Set[AnyStr]]:
         """
         Determines what data is read and written in this graph.
         Does not include reads to subsets of containers that have previously been written within the same state.
- 
+
         :return: A two-tuple of sets of things denoting ({data read}, {data written}).
""" return set(), set() @@ -421,7 +421,8 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto # Trace through scope entry using IN_# -> OUT_# if isinstance(curedge.dst, (nd.EntryNode, nd.ExitNode)): if curedge.dst_conn is None: - raise ValueError("Destination connector cannot be None for {}".format(curedge.dst)) + #raise ValueError("Destination connector cannot be None for {}".format(curedge.dst)) + break if not curedge.dst_conn.startswith("IN_"): # Map variable break next_edge = next(e for e in state.out_edges(curedge.dst) if e.src_conn == "OUT_" + curedge.dst_conn[3:]) @@ -794,7 +795,7 @@ def _read_and_write_sets(self) -> Tuple[Dict[AnyStr, List[Subset]], Dict[AnyStr, def read_and_write_sets(self) -> Tuple[Set[AnyStr], Set[AnyStr]]: """ Determines what data is read and written in this subgraph. - + :return: A two-tuple of sets of things denoting ({data read}, {data written}). """ @@ -2595,7 +2596,7 @@ def inline(self) -> Tuple[bool, Any]: for b_edge in parent.in_edges(self): parent.add_edge(b_edge.src, self.start_block, b_edge.data) parent.remove_edge(b_edge) - + end_state = None if len(to_connect) > 0: end_state = parent.add_state(self.label + '_end') @@ -3262,7 +3263,7 @@ def nodes(self) -> List['ControlFlowBlock']: def edges(self) -> List[Edge['dace.sdfg.InterstateEdge']]: return [] - + def _used_symbols_internal(self, all_symbols: bool, defined_syms: Optional[Set] = None, @@ -3304,7 +3305,7 @@ def to_json(self, parent=None): json['branches'] = [(condition.to_json() if condition is not None else None, cfg.to_json()) for condition, cfg in self._branches] return json - + @classmethod def from_json(cls, json_obj, context=None): context = context or {'sdfg': None, 'parent_graph': None} @@ -3322,7 +3323,7 @@ def from_json(cls, json_obj, context=None): else: ret._branches.append((None, ControlFlowRegion.from_json(region, context))) return ret - + def inline(self) -> Tuple[bool, Any]: """ Inlines the conditional region into its parent control flow region. diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index e75099276f..e0528e6584 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -186,7 +186,7 @@ def validate_control_flow_region(sdfg: 'SDFG', def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context: bool): """ Verifies the correctness of an SDFG by applying multiple tests. - + :param sdfg: The SDFG to verify. :param references: An optional set keeping seen IDs for object miscopy validation. 
@@ -464,7 +464,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', if isinstance(node, (nd.EntryNode, nd.ExitNode)): for iconn in node.in_connectors: - if (iconn is not None and iconn.startswith("IN_") and ("OUT_" + iconn[3:]) not in node.out_connectors): + if (iconn is not None and iconn.startswith("IN_") and not iconn.endswith("_size") and ("OUT_" + iconn[3:]) not in node.out_connectors): raise InvalidSDFGNodeError( "No match for input connector %s in output " "connectors" % iconn, @@ -685,14 +685,15 @@ def validate_state(state: 'dace.sdfg.SDFGState', break # Check if memlet data matches src or dst nodes + # If is read from the size output connector it needs to match the array's size descriptor name = e.data.data if isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Structure): name = None if isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Structure): name = None if (name is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) - and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn)) - and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn))): + and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn and name != src_node.data + "_size")) + and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn and name != dst_node.data + "_size"))): raise InvalidSDFGEdgeError( "Memlet data does not match source or destination " "data nodes)", @@ -716,14 +717,16 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Check memlet subset validity with respect to source/destination nodes if e.data.data is not None and e.data.allow_oob == False: subset_node = (dst_node - if isinstance(dst_node, nd.AccessNode) and e.data.data == dst_node.data else src_node) + if isinstance(dst_node, nd.AccessNode) and e.data.data == dst_node.data or e.data.data == dst_node.data + "_size" else src_node) other_subset_node = (dst_node - if isinstance(dst_node, nd.AccessNode) and e.data.data != dst_node.data else src_node) + if isinstance(dst_node, nd.AccessNode) and e.data.data != dst_node.data or e.data.data == dst_node.data + "_size" else src_node) if isinstance(subset_node, nd.AccessNode): arr = sdfg.arrays[subset_node.data] + size_arr = sdfg.arrays[subset_node.data + "_size"] # Dimensionality - if e.data.subset.dims() != len(arr.shape): + + if e.data.data == subset_node.data and e.data.subset.dims() != len(arr.shape): raise InvalidSDFGEdgeError( "Memlet subset does not match node dimension " "(expected %d, got %d)" % (len(arr.shape), e.data.subset.dims()), @@ -731,6 +734,14 @@ def validate_state(state: 'dace.sdfg.SDFGState', state_id, eid, ) + if e.data.data == (subset_node.data + "_size") and e.data.subset.dims() != len(size_arr.shape): + raise InvalidSDFGEdgeError( + "Memlet subset does not match node size dimension " + "(expected %d, got %d)" % (len(size_arr.shape), e.data.subset.dims()), + sdfg, + state_id, + eid, + ) # Bounds if any(((minel + off) < 0) == True for minel, off in zip(e.data.subset.min_element(), arr.offset)): @@ -741,10 +752,11 @@ def validate_state(state: 'dace.sdfg.SDFGState', raise InvalidSDFGEdgeError("Memlet subset negative out-of-bounds", sdfg, state_id, eid) if any(((maxel + off) >= s) == True for maxel, s, off in zip(e.data.subset.max_element(), arr.shape, arr.offset)): - if e.data.dynamic: + if e.data.dynamic or e.data.data.endswith("_size"): 
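                        # Size arrays receive their values at runtime, so subsets over them
                        # (like dynamic memlets) can only be warned about, not rejected.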
warnings.warn(f'Potential out-of-bounds memlet subset: {e}') else: - raise InvalidSDFGEdgeError("Memlet subset out-of-bounds", sdfg, state_id, eid) + warnings.warn(f'Memlet subset out-of-bounds {sdfg}, {state_id}, {eid}') + #raise InvalidSDFGEdgeError("Memlet subset out-of-bounds", sdfg, state_id, eid) # Test other_subset as well if e.data.other_subset is not None and isinstance(other_subset_node, nd.AccessNode): From e1442f7d3166fcf3d4987f947dd2475645fff700 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 30 Oct 2024 09:46:26 +0100 Subject: [PATCH 05/51] Refactor --- dace/codegen/targets/cpu.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index c11b94d9cd..fd54ed25b5 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -747,6 +747,9 @@ def _emit_copy( state_id, [src_node, dst_node], ) + stream.write( + "//u1" + ) return elif isinstance(src_node, nodes.Tasklet): # Copy out of tasklet @@ -756,6 +759,9 @@ def _emit_copy( state_id, [src_node, dst_node], ) + stream.write( + "//u2" + ) return else: # Copy array-to-array src_nodedesc = src_node.desc(sdfg) From dcbf2a2dbc1826d584caf56eb9e7e6ff067a9a1f Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 30 Oct 2024 10:52:56 +0100 Subject: [PATCH 06/51] Do not rely on naming conventions but save the size array descriptor's name --- dace/codegen/targets/cpp.py | 15 +++++-- dace/codegen/targets/cpu.py | 11 ++++-- dace/data.py | 79 ++++++++++++++++++++----------------- dace/sdfg/sdfg.py | 38 ++++++++++-------- 4 files changed, 83 insertions(+), 60 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index c34c829c31..d2ec6cf859 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -9,6 +9,7 @@ import itertools import math import numbers +import re import sys import warnings @@ -568,7 +569,14 @@ def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed if packed_veclen > 1: index /= packed_veclen - return sym2cpp(index) + size_desc_name = d.size_desc_name + if not (size_desc_name is None): + access_str_with_deferred_vars = sym2cpp(index) + pattern = r'__dace_defer_dim(\d+)' + access_str = re.sub(pattern, size_desc_name + r'[\1]', access_str_with_deferred_vars) + return access_str + else: + return sym2cpp(index) def cpp_array_expr(sdfg, @@ -585,7 +593,8 @@ def cpp_array_expr(sdfg, subset = memlet.subset if not use_other_subset else memlet.other_subset s = subset if relative_offset else subsets.Indices(offset) o = offset if relative_offset else None - desc = (sdfg.arrays[memlet.data] if referenced_array is None else referenced_array) + desc : dace.Data = (sdfg.arrays[memlet.data] if referenced_array is None else referenced_array) + desc_name = memlet.data offset_cppstr = cpp_offset_expr(desc, s, o, packed_veclen, indices=indices) # NOTE: Are there any cases where a mix of '.' and '->' is needed when traversing nested structs? @@ -737,7 +746,7 @@ def is_write_conflicted_with_reason(dfg, edge, datanode=None, sdfg_schedule=None Detects whether a write-conflict-resolving edge can be emitted without using atomics or critical sections, returning the node or SDFG that caused the decision. - + :return: None if the conflict is nonatomic, otherwise returns the scope entry node or SDFG that caused the decision to be made. 
""" diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index fd54ed25b5..d5cca48dba 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -498,8 +498,9 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node) # Initialize size array size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape]) - size_nodedesc = sdfg.arrays[f"{name}_size"] - declaration_stream.write(f'{size_nodedesc.dtype.ctype} {name}_size[{size_nodedesc.shape[0]}]{{{size_str}}};\n', cfg, state_id, node) + size_desc_name = nodedesc.size_desc_name + size_nodedesc = sdfg.arrays[size_desc_name] + declaration_stream.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n', cfg, state_id, node) if deferred_allocation: allocation_stream.write( "%s = nullptr; // Deferred Allocation" % @@ -521,7 +522,8 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV define_var(name, DefinedType.Pointer, ctypedef) - define_var(name + "_size", DefinedType.Pointer, size_nodedesc.dtype.ctype) + if not declared: + define_var(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype) if node.setzero: allocation_stream.write("memset(%s, 0, sizeof(%s)*%s);" % @@ -687,10 +689,11 @@ def reallocate( "#include " ) data_name = dst_node.data - size_array_name = f"{data_name}_size" new_size_array_name = src_node.data data = sdfg.arrays[data_name] + size_array_name = data.size_desc_name + new_size_array = sdfg.arrays[new_size_array_name] dtype = sdfg.arrays[data_name].dtype diff --git a/dace/data.py b/dace/data.py index a07fe42083..8b15246263 100644 --- a/dace/data.py +++ b/dace/data.py @@ -27,7 +27,7 @@ def create_datadescriptor(obj, no_custom_desc=False): """ Creates a data descriptor from various types of objects. - + :see: dace.data.Data """ from dace import dtypes # Avoiding import loops @@ -541,7 +541,7 @@ class TensorIndex(ABC): def iteration_type(self) -> TensorIterationTypes: """ Iteration capability supported by this index. - + See TensorIterationTypes for reference. """ pass @@ -559,7 +559,7 @@ def locate(self) -> bool: def assembly(self) -> TensorAssemblyType: """ What assembly type is supported by the index. - + See TensorAssemblyType for reference. """ pass @@ -569,7 +569,7 @@ def assembly(self) -> TensorAssemblyType: def full(self) -> bool: """ True if the level is full, False otw. - + A level is considered full if it encompasses all valid coordinates along the corresponding tensor dimension. """ @@ -580,7 +580,7 @@ def full(self) -> bool: def ordered(self) -> bool: """ True if the level is ordered, False otw. - + A level is ordered when all coordinates that share the same ancestor are ordered by increasing value (e.g. in typical CSR). """ @@ -591,7 +591,7 @@ def ordered(self) -> bool: def unique(self) -> bool: """ True if coordinate in the level are unique, False otw. - + A level is considered unique if no collection of coordinates that share the same ancestor contains duplicates. In CSR this is True, in COO it is not. @@ -603,7 +603,7 @@ def unique(self) -> bool: def branchless(self) -> bool: """ True if the level doesn't branch, false otw. - + A level is considered branchless if no coordinate has a sibling (another coordinate with same ancestor) and all coordinates in parent level have a child. 
In other words if there is a bijection between the coordinates @@ -617,7 +617,7 @@ def branchless(self) -> bool: def compact(self) -> bool: """ True if the level is compact, false otw. - + A level is compact if no two coordinates are separated by an unlabled node that does not encode a coordinate. An example of a compact level can be found in CSR, while the DIA formats range and offset levels are @@ -630,7 +630,7 @@ def compact(self) -> bool: def fields(self, lvl: int, dummy_symbol: symbolic.SymExpr) -> Dict[str, Data]: """ Generates the fields needed for the index. - + :return: a Dict of fields that need to be present in the struct """ pass @@ -668,7 +668,7 @@ def from_json(cls, json_obj, context=None): class TensorIndexDense(TensorIndex): """ Dense tensor index. - + Levels of this type encode the the coordinate in the interval [0, N), where N is the size of the corresponding dimension. This level doesn't need any index structure beyond the corresponding dimension size. @@ -735,9 +735,9 @@ def __repr__(self) -> str: class TensorIndexCompressed(TensorIndex): """ Tensor level that stores coordinates in segmented array. - + Levels of this type are compressed using a segented array. The pos array - holds the start and end positions of the segment in the crd (coordinate) + holds the start and end positions of the segment in the crd (coordinate) array that holds the child coordinates corresponding the parent. """ @@ -809,7 +809,7 @@ def __repr__(self) -> str: class TensorIndexSingleton(TensorIndex): """ Tensor index that encodes a single coordinate per parent coordinate. - + Levels of this type hold exactly one coordinate for every coordinate in the parent level. An example can be seen in the COO format, where every coordinate but the first is encoded in this manner. @@ -882,7 +882,7 @@ def __repr__(self) -> str: class TensorIndexRange(TensorIndex): """ Tensor index that encodes a interval of coordinates for every parent. - + The interval is computed from an offset for each parent together with the tensor dimension size of this level (M) and the parent level (N) parents corresponding tensor. Given the parent coordinate i, the level encodes the @@ -952,7 +952,7 @@ def __repr__(self) -> str: class TensorIndexOffset(TensorIndex): """ Tensor index that encodes the next coordinates as offset from parent. - + Given a parent coordinate i and an offset index k, the level encodes the coordinate j = i + offset[k]. """ @@ -1020,7 +1020,7 @@ def __repr__(self) -> str: class Tensor(Structure): """ Abstraction for Tensor storage format. - + This abstraction is based on [https://doi.org/10.1145/3276493]. """ @@ -1047,7 +1047,7 @@ def __init__(self, Below are examples of common matrix storage formats: .. code-block:: python - + M, N, nnz = (dace.symbol(s) for s in ('M', 'N', 'nnz')) csr = dace.data.Tensor( @@ -1123,7 +1123,7 @@ def __init__(self, :param value_type: data type of the explicitly stored values. :param tensor_shape: logical shape of tensor (#rows, #cols, etc...) - :param indices: + :param indices: a list of tuples, each tuple represents a level in the tensor storage hirachy, specifying the levels tensor index type, and the corresponding dimension this level encodes (as index of the @@ -1298,12 +1298,12 @@ class Array(Data): how it should behave. 
The array definition is flexible in terms of data allocation, it allows arbitrary multidimensional, potentially - symbolic shapes (e.g., an array with size ``N+1 x M`` will have ``shape=(N+1, M)``), of arbitrary data + symbolic shapes (e.g., an array with size ``N+1 x M`` will have ``shape=(N+1, M)``), of arbitrary data typeclasses (``dtype``). The physical data layout of the array is controlled by several properties: * The ``strides`` property determines the ordering and layout of the dimensions --- it specifies how many elements in memory are skipped whenever one element in that dimension is advanced. For example, the contiguous - dimension always has a stride of ``1``; a C-style MxN array will have strides ``(N, 1)``, whereas a + dimension always has a stride of ``1``; a C-style MxN array will have strides ``(N, 1)``, whereas a FORTRAN-style array of the same size will have ``(1, M)``. Strides can be larger than the shape, which allows post-padding of the contents of each dimension. * The ``start_offset`` property is a number of elements to pad the beginning of the memory buffer with. This is @@ -1320,7 +1320,7 @@ class Array(Data): to zero. To summarize with an example, a two-dimensional array with pre- and post-padding looks as follows: - + .. code-block:: text [xxx][ |xx] @@ -1338,7 +1338,7 @@ class Array(Data): Notice that the last padded row does not appear in strides, but is a consequence of ``total_size`` being larger. - + Apart from memory layout, other properties of ``Array`` help the data-centric transformation infrastructure make decisions about the array. ``allow_conflicts`` states that warnings should not be printed if potential conflicted @@ -1378,6 +1378,9 @@ class Array(Data): 'it is inferred by other properties and the OptionalArrayInference pass.') pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred') + size_desc_name = Property(dtype=str, default=None, allow_none=True, desc='The name of the size array (1D, length is the shape of thte current array)' + 'Of the array (usually _size)') + def __init__(self, dtype, shape, @@ -1394,13 +1397,15 @@ def __init__(self, total_size=None, start_offset=None, optional=None, - pool=False): + pool=False, + size_desc_name=None): super(Array, self).__init__(dtype, shape, transient, storage, location, lifetime, debuginfo) self.allow_conflicts = allow_conflicts self.may_alias = may_alias self.alignment = alignment + self.size_desc_name = size_desc_name if start_offset is not None: self.start_offset = start_offset @@ -1820,7 +1825,7 @@ def from_json(cls, json_obj, context=None): class View: - """ + """ Data descriptor that acts as a static reference (or view) of another data container. Can be used to reshape or reinterpret existing data without copying it. @@ -1832,9 +1837,9 @@ class View: node, and the other side (out/in) has a different number of edges. * If there is one incoming and one outgoing edge, and one leads to a code node, the one that leads to an access node is the viewed data. - * If both sides lead to access nodes, if one memlet's data points to the + * If both sides lead to access nodes, if one memlet's data points to the view it cannot point to the viewed node. - * If both memlets' data are the respective access nodes, the access + * If both memlets' data are the respective access nodes, the access node at the highest scope is the one that is viewed. * If both access nodes reside in the same scope, the input data is viewed. 
@@ -1903,11 +1908,11 @@ def view(viewed_container: Data, debuginfo=None): class Reference: - """ + """ Data descriptor that acts as a dynamic reference of another data descriptor. It can be used just like a regular data descriptor, except that it could be set to an arbitrary container (or subset thereof) at runtime. To set a reference, connect another access node to it and use the "set" connector. - + In order to enable data-centric analysis and optimizations, avoid using References as much as possible. """ @@ -1958,7 +1963,7 @@ def view(viewed_container: Data, debuginfo=None): @make_properties class ArrayView(Array, View): - """ + """ Data descriptor that acts as a static reference (or view) of another array. Can be used to reshape or reinterpret existing data without copying it. @@ -1982,7 +1987,7 @@ def as_array(self): @make_properties class StructureView(Structure, View): - """ + """ Data descriptor that acts as a view of another structure. """ @@ -2013,7 +2018,7 @@ def as_structure(self): @make_properties class ContainerView(ContainerArray, View): - """ + """ Data descriptor that acts as a view of another container array. Can be used to access nested container types without a copy. """ @@ -2055,9 +2060,9 @@ def as_array(self): @make_properties class ArrayReference(Array, Reference): - """ + """ Data descriptor that acts as a dynamic reference of another array. See ``Reference`` for more information. - + In order to enable data-centric analysis and optimizations, avoid using References as much as possible. """ @@ -2077,9 +2082,9 @@ def as_array(self): @make_properties class StructureReference(Structure, Reference): - """ + """ Data descriptor that acts as a dynamic reference of another Structure. See ``Reference`` for more information. - + In order to enable data-centric analysis and optimizations, avoid using References as much as possible. """ @@ -2102,10 +2107,10 @@ def as_structure(self): @make_properties class ContainerArrayReference(ContainerArray, Reference): - """ + """ Data descriptor that acts as a dynamic reference of another data container array. See ``Reference`` for more information. - + In order to enable data-centric analysis and optimizations, avoid using References as much as possible. 
""" diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index f91e62df97..36572647e1 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -1714,20 +1714,6 @@ def add_array(self, if isinstance(dtype, type) and dtype in dtypes._CONSTANT_TYPES[:-1]: dtype = dtypes.typeclass(dtype) - desc = dt.Array(dtype=dtype, - shape=shape, - storage=storage, - location=location, - allow_conflicts=allow_conflicts, - transient=transient, - strides=strides, - offset=offset, - lifetime=lifetime, - alignment=alignment, - debuginfo=debuginfo, - total_size=total_size, - may_alias=may_alias) - size_desc = dt.Array(dtype=dace.uint64, shape=(len(shape),), storage=storage, @@ -1740,10 +1726,30 @@ def add_array(self, alignment=alignment, debuginfo=debuginfo, total_size=len(shape), - may_alias=False) + may_alias=False, + size_desc_name=None) + + desc = dt.Array(dtype=dtype, + shape=shape, + storage=storage, + location=location, + allow_conflicts=allow_conflicts, + transient=transient, + strides=strides, + offset=offset, + lifetime=lifetime, + alignment=alignment, + debuginfo=debuginfo, + total_size=total_size, + may_alias=may_alias, + size_desc_name=None) array_name = self.add_datadesc(name, desc, find_new_name=find_new_name) - self.add_datadesc(f"{array_name}_size", size_desc, find_new_name=False) + size_desc_name = f"{array_name}_size" + self.add_datadesc(size_desc_name, size_desc, find_new_name=False) + # In case find_new_name and a new name is returned + # we need to update the size descriptor name of the array + desc.size_desc_name = size_desc_name return array_name, desc def add_view(self, From e51698598c78fcf0b719d7ca7f457f1f1df3c93b Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Mon, 4 Nov 2024 21:53:03 +0100 Subject: [PATCH 07/51] dace/sdfg/validation.py --- dace/sdfg/validation.py | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 891b321144..2df9e17445 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -191,7 +191,7 @@ def validate_control_flow_region(sdfg: 'SDFG', def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context: bool): """ Verifies the correctness of an SDFG by applying multiple tests. - + :param sdfg: The SDFG to verify. :param references: An optional set keeping seen IDs for object miscopy validation. 
@@ -471,7 +471,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', if isinstance(node, (nd.EntryNode, nd.ExitNode)): for iconn in node.in_connectors: - if (iconn is not None and iconn.startswith("IN_") and not iconn.endswith("_size") and ("OUT_" + iconn[3:]) not in node.out_connectors): + if (iconn is not None and iconn.startswith("IN_") and ("OUT_" + iconn[3:]) not in node.out_connectors): raise InvalidSDFGNodeError( "No match for input connector %s in output " "connectors" % iconn, @@ -692,15 +692,14 @@ def validate_state(state: 'dace.sdfg.SDFGState', break # Check if memlet data matches src or dst nodes - # If is read from the size output connector it needs to match the array's size descriptor name = e.data.data if isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Structure): name = None if isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Structure): name = None if (name is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) - and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn and name != src_node.data + "_size")) - and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn and name != dst_node.data + "_size"))): + and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn)) + and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn))): raise InvalidSDFGEdgeError( "Memlet data does not match source or destination " "data nodes)", @@ -724,16 +723,14 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Check memlet subset validity with respect to source/destination nodes if e.data.data is not None and e.data.allow_oob == False: subset_node = (dst_node - if isinstance(dst_node, nd.AccessNode) and e.data.data == dst_node.data or e.data.data == dst_node.data + "_size" else src_node) + if isinstance(dst_node, nd.AccessNode) and e.data.data == dst_node.data else src_node) other_subset_node = (dst_node - if isinstance(dst_node, nd.AccessNode) and e.data.data != dst_node.data or e.data.data == dst_node.data + "_size" else src_node) + if isinstance(dst_node, nd.AccessNode) and e.data.data != dst_node.data else src_node) if isinstance(subset_node, nd.AccessNode): arr = sdfg.arrays[subset_node.data] - size_arr = sdfg.arrays[subset_node.data + "_size"] # Dimensionality - - if e.data.data == subset_node.data and e.data.subset.dims() != len(arr.shape): + if e.data.subset.dims() != len(arr.shape): raise InvalidSDFGEdgeError( "Memlet subset does not match node dimension " "(expected %d, got %d)" % (len(arr.shape), e.data.subset.dims()), @@ -741,14 +738,6 @@ def validate_state(state: 'dace.sdfg.SDFGState', state_id, eid, ) - if e.data.data == (subset_node.data + "_size") and e.data.subset.dims() != len(size_arr.shape): - raise InvalidSDFGEdgeError( - "Memlet subset does not match node size dimension " - "(expected %d, got %d)" % (len(size_arr.shape), e.data.subset.dims()), - sdfg, - state_id, - eid, - ) # Bounds if any(((minel + off) < 0) == True for minel, off in zip(e.data.subset.min_element(), arr.offset)): @@ -759,11 +748,10 @@ def validate_state(state: 'dace.sdfg.SDFGState', raise InvalidSDFGEdgeError("Memlet subset negative out-of-bounds", sdfg, state_id, eid) if any(((maxel + off) >= s) == True for maxel, s, off in zip(e.data.subset.max_element(), arr.shape, arr.offset)): - if e.data.dynamic or e.data.data.endswith("_size"): + if e.data.dynamic: 
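                            # Dynamic memlets resolve their true volume only at runtime,
                            # so a static subset check can do no better than warn here.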
warnings.warn(f'Potential out-of-bounds memlet subset: {e}') else: - warnings.warn(f'Memlet subset out-of-bounds {sdfg}, {state_id}, {eid}') - #raise InvalidSDFGEdgeError("Memlet subset out-of-bounds", sdfg, state_id, eid) + raise InvalidSDFGEdgeError("Memlet subset out-of-bounds", sdfg, state_id, eid) # Test other_subset as well if e.data.other_subset is not None and isinstance(other_subset_node, nd.AccessNode): From 925f8c7a9902f5f146a329489d4104bddc15018a Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 5 Nov 2024 09:51:25 +0100 Subject: [PATCH 08/51] Improve validation --- dace/sdfg/validation.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 2df9e17445..1e1db1b621 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -191,7 +191,7 @@ def validate_control_flow_region(sdfg: 'SDFG', def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context: bool): """ Verifies the correctness of an SDFG by applying multiple tests. - + :param sdfg: The SDFG to verify. :param references: An optional set keeping seen IDs for object miscopy validation. @@ -697,9 +697,20 @@ def validate_state(state: 'dace.sdfg.SDFGState', name = None if isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Structure): name = None + # Special case: if the name is the size array of the src_node, then it is ok, checked with the "size_desc_name" + src_size_access = isinstance(src_node, nd.AccessNode) and name == sdfg.arrays[src_node.data].size_desc_name + dst_size_access = isinstance(dst_node, nd.AccessNode) and name == sdfg.arrays[dst_node.data].size_desc_name + if src_size_access and dst_size_access: + raise InvalidSDFGEdgeError( + "Reading from the size connector and writing to the size connector at the same time of same data is not valid", + sdfg, + state_id, + eid, + ) if (name is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode)) - and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn)) - and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn))): + and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn and not src_size_access)) + and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn and not dst_size_access)) + ): raise InvalidSDFGEdgeError( "Memlet data does not match source or destination " "data nodes)", @@ -728,7 +739,12 @@ def validate_state(state: 'dace.sdfg.SDFGState', if isinstance(dst_node, nd.AccessNode) and e.data.data != dst_node.data else src_node) if isinstance(subset_node, nd.AccessNode): - arr = sdfg.arrays[subset_node.data] + if src_size_access: + arr = sdfg.arrays[sdfg.arrays[src_node.data].size_desc_name] + elif dst_size_access: + arr = sdfg.arrays[sdfg.arrays[dst_node.data].size_desc_name] + else: + arr = sdfg.arrays[subset_node.data] # Dimensionality if e.data.subset.dims() != len(arr.shape): raise InvalidSDFGEdgeError( From 93eae375ef5b6b8d3754533db4779003a0b46af3 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 5 Nov 2024 10:44:22 +0100 Subject: [PATCH 09/51] More validation cases --- dace/sdfg/validation.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 1e1db1b621..9d5bd8f1bc 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -700,6 
+700,7 @@ def validate_state(state: 'dace.sdfg.SDFGState',
             # Special case: if the name is the size array of the src_node, then it is ok, checked with the "size_desc_name"
             src_size_access = isinstance(src_node, nd.AccessNode) and name == sdfg.arrays[src_node.data].size_desc_name
             dst_size_access = isinstance(dst_node, nd.AccessNode) and name == sdfg.arrays[dst_node.data].size_desc_name
+            sdict = state.scope_dict()
             if src_size_access and dst_size_access:
                 raise InvalidSDFGEdgeError(
                     "Reading from the size connector and writing to the size connector at the same time of same data is not valid",
@@ -707,6 +708,22 @@ def validate_state(state: 'dace.sdfg.SDFGState',
                     state_id,
                     eid,
                 )
+            if dst_size_access and sdict[dst_node] is not None:
+                raise InvalidSDFGEdgeError(
+                    "Reallocating data (writing to the size connector) within a scope is not valid",
+                    sdfg,
+                    state_id,
+                    eid,
+                )
+            if dst_size_access:
+                dst_arr = sdfg.arrays[dst_node.data]
+                if dst_arr.storage != dace.dtypes.StorageType.GPU_Global and dst_arr.storage != dace.dtypes.StorageType.CPU_Heap:
+                    raise InvalidSDFGEdgeError(
+                        "Reallocating data (writing to the size connector) is only valid for CPU_Heap or GPU_Global storage",
+                        sdfg,
+                        state_id,
+                        eid,
+                    )
             if (name is not None and (isinstance(src_node, nd.AccessNode) or isinstance(dst_node, nd.AccessNode))
                     and (not isinstance(src_node, nd.AccessNode) or (name != src_node.data and name != e.src_conn and not src_size_access))
                     and (not isinstance(dst_node, nd.AccessNode) or (name != dst_node.data and name != e.dst_conn and not dst_size_access))
                 ):

From 5b55425655b41c85d6f20e733aa8302db2e4ea61 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Tue, 5 Nov 2024 16:14:48 +0100
Subject: [PATCH 10/51] Add support for deferred allocation on GPU global
 arrays

---
 dace/codegen/targets/cpp.py  |  16 ++--
 dace/codegen/targets/cpu.py  |  21 ++++-
 dace/codegen/targets/cuda.py | 170 ++++++++++++++++++++++++++++++-----
 3 files changed, 176 insertions(+), 31 deletions(-)

diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py
index d2ec6cf859..632d19e714 100644
--- a/dace/codegen/targets/cpp.py
+++ b/dace/codegen/targets/cpp.py
@@ -539,7 +539,8 @@ def ndcopy_to_strided_copy(
     return None
 
 
-def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed_veclen=1, indices=None):
+def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed_veclen=1, indices=None,
+                    deferred_size_names=None):
     """
     Creates a C++ expression that can be added to a pointer in order
     to offset it to the beginning of the given subset and offset.
@@ -569,11 +570,13 @@ def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed
     if packed_veclen > 1:
         index /= packed_veclen
 
-    size_desc_name = d.size_desc_name
-    if not (size_desc_name is None):
+    if not (deferred_size_names is None):
         access_str_with_deferred_vars = sym2cpp(index)
+        def replace_pattern(match):
+            number = match.group(1)
+            return deferred_size_names[int(number)]
         pattern = r'__dace_defer_dim(\d+)'
-        access_str = re.sub(pattern, size_desc_name + r'[\1]', access_str_with_deferred_vars)
+        access_str = re.sub(pattern, replace_pattern, access_str_with_deferred_vars)
         return access_str
     else:
         return sym2cpp(index)
@@ -588,14 +591,15 @@ def cpp_array_expr(sdfg,
                    use_other_subset=False,
                    indices=None,
                    referenced_array=None,
-                   codegen=None):
+                   codegen=None,
+                   deferred_size_names=None):
     """ Converts an Indices/Range object to a C++ array access string.
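    When ``deferred_size_names`` is given, placeholder symbols of the form
    ``__dace_defer_dim<i>`` in the resulting access expression are substituted
    with the corresponding runtime size expression, e.g. ``A_size[0]`` for
    CPU_Heap arrays or ``__A_dim0_size`` for GPU_Global arrays.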
""" subset = memlet.subset if not use_other_subset else memlet.other_subset s = subset if relative_offset else subsets.Indices(offset) o = offset if relative_offset else None desc : dace.Data = (sdfg.arrays[memlet.data] if referenced_array is None else referenced_array) desc_name = memlet.data - offset_cppstr = cpp_offset_expr(desc, s, o, packed_veclen, indices=indices) + offset_cppstr = cpp_offset_expr(desc, s, o, packed_veclen, indices=indices, deferred_size_names=deferred_size_names) # NOTE: Are there any cases where a mix of '.' and '->' is needed when traversing nested structs? # TODO: Study this when changing Structures to be (optionally?) non-pointers. diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index d5cca48dba..ed513f8bd6 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1164,7 +1164,26 @@ def process_out_memlets(self, write_expr = f"*({ptr_str} + {array_expr}) = {in_local_name};" else: desc_dtype = desc.dtype - expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame) + # If the storage type if CPU_Heap or GPU_Global then it might be requiring deferred allocation + # We can check if the array requires sepcial access using A_size[0] (CPU) or __A_dim0_size (GPU0) + # by going through the shape and checking for symbols starting with __dace_defer + def check_dace_defer(elements): + for elem in elements: + if isinstance(elem, symbolic.symbol) and str(elem).startswith("__dace_defer"): + return True + return False + deferred_size_names = None + if check_dace_defer(desc.shape): + if desc.storage == dtypes.StorageType.GPU_Global or desc.storage == dtypes.StorageType.CPU_Heap: + deferred_size_names = [] + for i, elem in enumerate(desc.shape): + if str(elem).startswith("__dace_defer"): + deferred_size_names.append(f"__{memlet.data}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]") + else: + deferred_size_names.append(elem) + else: + raise Exception("Deferred Allocation only supported on array storages of type GPU_Global or CPU_Heap") + expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame, deferred_size_names=deferred_size_names) write_expr = codegen.make_ptr_assignment(in_local_name, conntype, expr, desc_dtype) # Write out diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 1cf8919d74..d70bb4097d 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -133,6 +133,8 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): illegal_copy, predicate=cpu_to_gpu_cpred) dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, st, sched_type, illegal_copy) + + dispatcher.register_reallocate_dispatcher(dtypes.StorageType.GPU_Global, self) # End of illegal copies # End of dispatcher registration ###################################### @@ -339,7 +341,7 @@ def get_generated_codeobjects(self): cudaMemPool_t mempool; cudaDeviceGetDefaultMemPool(&mempool, 0); uint64_t threshold = {poolcfg if poolcfg != -1 else 'UINT64_MAX'}; - cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold); + cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold); ''' self._codeobject.code = """ @@ -548,6 +550,21 @@ def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi declaration_stream.write(result_decl.getvalue(), cfg, state_id, node) + def _alloc_gpu_global(self, node, nodedesc, result_alloc, dataname, arrsize_malloc): + if nodedesc.pool: + cudastream = getattr(node, 
'_cuda_stream', 'nullptr') + if cudastream != 'nullptr': + cudastream = f'__state->gpu_context->streams[{cudastream}]' + result_alloc.write( + f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n' + ) + self._emit_sync(result_alloc) + else: + # Strides are left to the user's discretion + result_alloc.write('DACE_GPU_CHECK(%sMalloc((void**)&%s, %s));\n' % + (self.backend, dataname, arrsize_malloc)) + + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: @@ -586,27 +603,31 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) ctypedef = '%s *' % nodedesc.dtype.ctype + deferred_allocation = any([s for s in nodedesc.shape if str(s).startswith("__dace_defer")]) # Different types of GPU arrays if nodedesc.storage == dtypes.StorageType.GPU_Global: if not declared: result_decl.write('%s %s;\n' % (ctypedef, dataname)) + size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape]) + size_desc_name = nodedesc.size_desc_name + size_nodedesc = sdfg.arrays[size_desc_name] + result_decl.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n') + self._dispatcher.defined_vars.add(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype) self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) - if nodedesc.pool: - cudastream = getattr(node, '_cuda_stream', 'nullptr') - if cudastream != 'nullptr': - cudastream = f'__state->gpu_context->streams[{cudastream}]' + + if deferred_allocation: result_alloc.write( - f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n' + "%s = nullptr; // Deferred Allocation" % + (dataname,) ) - self._emit_sync(result_alloc) else: - # Strides are left to the user's discretion - result_alloc.write('DACE_GPU_CHECK(%sMalloc((void**)&%s, %s));\n' % - (self.backend, dataname, arrsize_malloc)) + self._alloc_gpu_global(node, nodedesc, result_alloc, dataname, arrsize_malloc) if node.setzero: + if deferred_allocation: + raise Exception("Deferred Allocation and setzero is not supported at the same time.") result_alloc.write('DACE_GPU_CHECK(%sMemset(%s, 0, %s));\n' % (self.backend, dataname, arrsize_malloc)) if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: result_alloc.write(f'{dataname} += {cpp.sym2cpp(nodedesc.start_offset)};\n') @@ -755,7 +776,7 @@ def _compute_cudastreams(self, sdfg: SDFG, default_stream=0, default_event=0): """ Annotates an SDFG (and all nested ones) to include a `_cuda_stream` field. This field is applied to all GPU maps, tasklets, and copies that can be executed in parallel. - + :param sdfg: The sdfg to modify. :param default_stream: The stream ID to start counting from (used in recursion to nested SDFGs). 
@@ -1460,6 +1481,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub if isinstance(node, nodes.AccessNode): nsdfg: SDFG = parent.parent desc = node.desc(nsdfg) + sizedesc = nsdfg.arrays[desc.size_desc_name] if (nsdfg, node.data) in visited: continue visited.add((nsdfg, node.data)) @@ -1485,7 +1507,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub const_params = _get_const_params(dfg_scope) # make dynamic map inputs constant # TODO move this into _get_const_params(dfg_scope) - const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry)) + # Do not add src as const if the size is being red (src_conn is OUT_size) + const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if not e.src_conn.endswith("size")) # Store init/exit code streams old_entry_stream = self.scope_entry_stream @@ -1505,6 +1528,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub # Refactor and fix when nested SDFGs are separate functions. self._dispatcher.defined_vars.enter_scope(scope_entry) prototype_kernel_args = {} + host_size_args = {} for aname, arg in kernel_args.items(): # `list` wrapper is used to modify kernel_args within the loop if aname in const_params: defined_type, ctype = None, None @@ -1558,8 +1582,17 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub prototype_kernel_args[aname] = arg + if aname in sdfg.arrays: + size_arr_name = data_desc.size_desc_name + size_arr = sdfg.arrays[data_desc.size_desc_name] + size_arr_len = size_arr.shape[0] + size_arr_dtype = size_arr.dtype.ctype + host_size_args[size_arr_name] = size_arr + kernel_args_typed = [('const ' if k in const_params else '') + v.as_arg(name=k) for k, v in prototype_kernel_args.items()] + host_size_args_typed = ['const ' + v.as_arg(name=k) + for k, v in host_size_args.items()] kernel_stream = CodeIOStream() self.generate_kernel_scope(sdfg, cfg, dfg_scope, state_id, scope_entry.map, kernel_name, grid_dims, block_dims, @@ -1585,9 +1618,26 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})' # Write kernel prototype + dyn_args = [] + dyn_args_typed = [] + for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): + dyn_args.append(e.dst_conn) + dyn_args_typed.append(f"const {e.dst.in_connectors[e.dst_conn]} {e.dst_conn}") + # Size arrays + needed_size_scalars_declaration = [] + for size_desc_name, arg in host_size_args.items(): + if isinstance(arg, dt.Array): + size_arr = arg + arr_name = size_desc_name.removesuffix("_size") + for i in range(size_arr.shape[0]): + if f"__{arr_name}_dim{i}_size" not in dyn_args: + dyn_args.append(f"__{arr_name}_dim{i}_size") + dyn_args_typed.append(f"const {dace.uint64} __{arr_name}_dim{i}_size") + needed_size_scalars_declaration.append(f"const {dace.uint64} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];") + self._localcode.write( '__global__ void %s %s(%s) {\n' % - (launch_bounds, kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed)), sdfg, state_id, node) + (launch_bounds, kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed + dyn_args_typed)), sdfg, state_id, node) # Write constant expressions in GPU code self._frame.generate_constants(sdfg, self._localcode) @@ -1612,7 +1662,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub DACE_EXPORTED void 
__dace_runkernel_{fname}({fargs}); void __dace_runkernel_{fname}({fargs}) {{ -""".format(fname=kernel_name, fargs=', '.join(state_param + kernel_args_typed + extra_call_args_typed)), cfg, state_id, +""".format(fname=kernel_name, fargs=', '.join(state_param + kernel_args_typed + extra_call_args_typed + host_size_args_typed)), cfg, state_id, node) if is_persistent: @@ -1659,9 +1709,13 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub # make sure dynamic map inputs are properly handled for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): + memlet_definition = self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]) self._localcode.write( - self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), + memlet_definition, cfg, state_id, scope_entry) + self._localcode.write("// Array sizes of arrays are passed to the kernel even if not used in maps") + for decl in needed_size_scalars_declaration: + self._localcode.write(decl, cfg, state_id, scope_entry) gdims = 'dace_number_blocks, 1, 1' if is_persistent else ', '.join(_topy(grid_dims)) bdims = ', '.join(_topy(block_dims)) @@ -1696,7 +1750,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub void *{kname}_args[] = {{ {kargs} }}; gpuError_t __err = {backend}LaunchKernel((void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream});''' .format(kname=kernel_name, - kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + extra_kernel_args), + kargs=', '.join(['(void *)&' + arg for arg in prototype_kernel_args] + + ['(void *)&' + arg for arg in dyn_args] + + extra_kernel_args), gdims=gdims, bdims=bdims, dynsmem=_topy(dynsmem_size), @@ -1714,7 +1770,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub # Add invocation to calling code (in another file) function_stream.write( 'DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % - (kernel_name, ', '.join(state_param + kernel_args_typed + extra_call_args_typed)), cfg, state_id, + (kernel_name, ', '.join(state_param + kernel_args_typed + extra_call_args_typed + host_size_args_typed)), cfg, state_id, scope_entry) # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. @@ -1737,7 +1793,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub '__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(['__state'] + [cpp.ptr(aname, arg, sdfg, self._frame) - for aname, arg in kernel_args.items()] + extra_call_args)), cfg, state_id, + for aname, arg in kernel_args.items()] + extra_call_args + + list(host_size_args.keys()))), cfg, state_id, scope_entry) # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. 
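
To make the naming convention above concrete: for an array A whose dimension i is deferred, the host keeps the concrete extent in A_size[i], and the kernel receives it as a scalar argument __A_dim<i>_size declared from that size array. Index expressions that still contain the placeholder symbol __dace_defer_dim<i> are rewritten to those names before emission (see the cpp_offset_expr change earlier in this patch). The snippet below is a minimal, self-contained model of that substitution; resolve_deferred_dims is an illustrative name, not DaCe API:

    import re

    def resolve_deferred_dims(index_expr: str, deferred_size_names: list) -> str:
        # __dace_defer_dim<N>  ->  deferred_size_names[<N>]
        return re.sub(r'__dace_defer_dim(\d+)',
                      lambda m: str(deferred_size_names[int(m.group(1))]),
                      index_expr)

    # For an array A of shape (15, __dace_defer_dim1):
    #   CPU_Heap uses the size array:      ['15', 'A_size[1]']
    #   GPU_Global uses the kernel scalar: ['15', '__A_dim1_size']
    assert resolve_deferred_dims('i*__dace_defer_dim1 + j',
                                 ['15', '__A_dim1_size']) == 'i*__A_dim1_size + j'
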
@@ -2008,12 +2065,12 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S
         dsym = [symbolic.symbol('__DAPB%d' % i, nonnegative=True, integer=True) for i in range(len(krange))]
         bidx = krange.coord_at(dsym)
 
-        # handle dynamic map inputs
-        for e in dace.sdfg.dynamic_map_inputs(sdfg.states()[state_id], dfg_scope.source_nodes()[0]):
-            kernel_stream.write(
-                self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]),
-                cfg, state_id,
-                dfg_scope.source_nodes()[0])
+        # Dynamic map inputs are input arguments
+        #for e in dace.sdfg.dynamic_map_inputs(sdfg.states()[state_id], dfg_scope.source_nodes()[0]):
+        #    kernel_stream.write(
+        #        self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]),
+        #        cfg, state_id,
+        #        dfg_scope.source_nodes()[0])
 
         # do not generate an index if the kernel map is persistent
         if node.map.schedule != dtypes.ScheduleType.GPU_Persistent:
@@ -2715,6 +2772,71 @@ def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgra
     def make_ptr_vector_cast(self, *args, **kwargs):
         return cpp.make_ptr_vector_cast(*args, **kwargs)
 
+    def reallocate(
+        self,
+        sdfg: SDFG,
+        cfg: ControlFlowRegion,
+        dfg: StateSubgraphView,
+        state_id: int,
+        src_node: nodes.AccessNode,
+        dst_node: nodes.AccessNode,
+        edge: Tuple[nodes.Node, Optional[str], nodes.Node, Optional[str], dace.memlet.Memlet],
+        function_stream: CodeIOStream,
+        callsite_stream: CodeIOStream,
+    ):
+        function_stream.write(
+            "#include "
+        )
+        data_name = dst_node.data
+        new_size_array_name = src_node.data
+
+        data = sdfg.arrays[data_name]
+        size_array_name = data.size_desc_name
+
+        new_size_array = sdfg.arrays[new_size_array_name]
+        dtype = sdfg.arrays[data_name].dtype
+
+        # Only consider the offsets with __dace_defer in the original dimension
+        mask_array = [str(dim).startswith("__dace_defer") for dim in data.shape]
+
+        # Call realloc only after no __dace_defer is left in size_array (must be true)
+        # Save new and old sizes before registering them, because we need both to compute the bound of the new array
+        old_size_str = " * ".join([f"{size_array_name}[{i}]" for i in range(len(data.shape))])
+        old_size_str += f" * sizeof({dtype.ctype})"
+        new_size_str = " * ".join([f"{new_size_array_name}[{i}]" if mask_array[i] else f"{size_array_name}[{i}]" for i in range(len(data.shape)) ])
+        new_size_str += f" * sizeof({dtype.ctype})"
+        # The copy size is needed in both the pooled and the non-pooled branch below
+        copy_size_str = f"Min({old_size_str}, {new_size_str})"
+        tmp_storage_name = "__tmp_realloc_move_storage"
+
+        callsite_stream.write(f"if ({dst_node.data} == nullptr) {{", cfg, state_id, dst_node.guid)
+        self._alloc_gpu_global(dst_node, data, callsite_stream, data_name, new_size_str)
+        callsite_stream.write("} else {\n", cfg, state_id, dst_node.guid)
+        callsite_stream.write(f"{dtype}* {tmp_storage_name};")
+        self._alloc_gpu_global(None, data, callsite_stream, tmp_storage_name, new_size_str)
+        s = ""
+        if not data.pool: # If pooled, will be freed somewhere else
+            s += f"DACE_GPU_CHECK({self.backend}Memcpy(static_cast<void *>({tmp_storage_name}), static_cast<const void *>({data_name}), {copy_size_str}, cudaMemcpyDeviceToDevice));\n"
+            s += f"DACE_GPU_CHECK({self.backend}Free({data_name}));\n"
+            s += f"{data_name} = {tmp_storage_name};\n"
+        else:
+            cudastream = getattr(dst_node, '_cuda_stream', 'nullptr')
+            if cudastream != 'nullptr':
+                cudastream = f'__state->gpu_context->streams[{cudastream}]'
+            s += f"DACE_GPU_CHECK({self.backend}MemcpyAsync(static_cast<void *>({tmp_storage_name}), static_cast<const void *>({data_name}), {copy_size_str}, cudaMemcpyDeviceToDevice, {cudastream}));\n"
+            s += f"DACE_GPU_CHECK({self.backend}FreeAsync({data_name}, {cudastream}));\n"
+        callsite_stream.write(s)
+        self._emit_sync(callsite_stream)
+        callsite_stream.write(f"{data_name} = {tmp_storage_name};\n")
+        s = ""
+        s += "}\n"
+        callsite_stream.write(s)
+
+        for i, mask in enumerate(mask_array):
+            if mask:
+                callsite_stream.write(
+                    f"{size_array_name}[{i}] = {new_size_array_name}[{i}];"
+                )
 
 ########################################################################
 ########################################################################

From c7836685ba53e36131426058dba8780b8ebd0d1f Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Tue, 5 Nov 2024 17:27:03 +0100
Subject: [PATCH 11/51] Non-transient support attempt 1

---
 dace/codegen/targets/cpu.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index ed513f8bd6..3dc100481b 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -338,6 +338,12 @@ def declare_array(self,
 
             declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', cfg, state_id, node)
             self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef)
+
+            size_arr_name = sdfg.arrays[name].size_desc_name
+            size_arr_desc = sdfg.arrays[size_arr_name]
+            size_ctypedef = dtypes.pointer(size_arr_desc.dtype).ctype
+
+            self._dispatcher.declared_arrays.add(size_arr_name, DefinedType.Pointer, size_ctypedef)
             return
         elif nodedesc.storage is dtypes.StorageType.CPU_ThreadLocal:
             # Define pointer once
@@ -388,6 +394,8 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
         # Check if array is already declared
         declared = self._dispatcher.declared_arrays.has(name)
 
+        print("D1", nodedesc, declared)
+
         define_var = self._dispatcher.defined_vars.add
         if top_lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External):
             define_var = self._dispatcher.defined_vars.add_global
@@ -520,7 +528,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
                 node
             )
 
-        
+        print(nodedesc, declared)
         define_var(name, DefinedType.Pointer, ctypedef)
         if not declared:
             define_var(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype)

From dc81d6977474ebae1cb575f46390eb73ed26a6e9 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Wed, 6 Nov 2024 09:53:01 +0100
Subject: [PATCH 12/51] Improvements in GPU_Global support

---
 dace/codegen/targets/cpu.py       | 9 +++++----
 dace/codegen/targets/fpga.py      | 1 +
 dace/codegen/targets/framecode.py | 3 ++-
 dace/sdfg/sdfg.py                 | 4 ++--
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index 3dc100481b..a211fa7d5c 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -319,6 +319,7 @@ def declare_array(self,
             name = node.root_data
         ptrname = cpp.ptr(name, nodedesc, sdfg, self._frame)
 
+        print("D2", name, nodedesc)
         if nodedesc.transient is False:
             return
 
@@ -366,7 +366,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
                        allocate_nested_data: bool = True) -> None:
         alloc_name = cpp.ptr(node.data, nodedesc, sdfg, self._frame)
         name = alloc_name
-
         tokens = node.data.split('.')
         top_desc = sdfg.arrays[tokens[0]]
         # NOTE: Assuming here that all Structure members share transient/storage/lifetime properties.
@@ -374,9 +374,11 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV top_storage = top_desc.storage top_lifetime = top_desc.lifetime + if top_transient is False: return + # Check if array is already allocated if self._dispatcher.defined_vars.has(name): return @@ -394,8 +396,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV # Check if array is already declared declared = self._dispatcher.declared_arrays.has(name) - print("D1", nodedesc, declared) - define_var = self._dispatcher.defined_vars.add if top_lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): define_var = self._dispatcher.defined_vars.add_global @@ -488,6 +488,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV ((symbolic.issymbolic(arrsize, sdfg.constants)) or (arrsize_bytes and ((arrsize_bytes > Config.get("compiler", "max_stack_array_size")) == True))))): + if nodedesc.storage == dtypes.StorageType.Register: if symbolic.issymbolic(arrsize, sdfg.constants): @@ -528,7 +529,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV node ) - print(nodedesc, declared) define_var(name, DefinedType.Pointer, ctypedef) if not declared: define_var(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype) @@ -2264,6 +2264,7 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub memlet_path = state_dfg.memlet_path(edge) if memlet_path[-1].dst == node: src_node = memlet_path[0].src + if in_connector == "IN_size": self._dispatcher.dispatch_reallocate( src_node, diff --git a/dace/codegen/targets/fpga.py b/dace/codegen/targets/fpga.py index 0c74d6ec07..d1e3d57d17 100644 --- a/dace/codegen/targets/fpga.py +++ b/dace/codegen/targets/fpga.py @@ -2365,6 +2365,7 @@ def generate_host_function_boilerplate(self, sdfg, cfg, state, nested_global_tra # Any extra transients stored in global memory on the FPGA must now be # allocated and passed to the kernel + for arr_node in nested_global_transients: self._dispatcher.dispatch_allocate(sdfg, cfg, state, None, arr_node, arr_node.desc(sdfg), None, host_code_stream) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index d71ea40fee..97af9266f9 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -105,7 +105,7 @@ def dispatcher(self): def preprocess(self, sdfg: SDFG) -> None: """ Called before code generation. Used for making modifications on the SDFG prior to code generation. - + :note: Post-conditions assume that the SDFG will NOT be changed after this point. :param sdfg: The SDFG to modify in-place. 
""" @@ -906,6 +906,7 @@ def generate_code(self, # Allocate inter-state variables global_symbols = copy.deepcopy(sdfg.symbols) global_symbols.update({aname: arr.dtype for aname, arr in sdfg.arrays.items()}) + interstate_symbols = {} for cfr in sdfg.all_control_flow_regions(): for e in cfr.dfs_edges(cfr.start_block): diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 349adfd2dc..6d58554d94 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -1716,8 +1716,8 @@ def add_array(self, size_desc = dt.Array(dtype=dace.uint64, shape=(len(shape),), - storage=storage, - location=location, + storage=dtypes.StorageType.Default, + location=None, allow_conflicts=False, transient=True, strides=(1,), From c14b91e59030fd8cf7a7edbeba60fbb77375082a Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 20 Nov 2024 11:54:04 +0100 Subject: [PATCH 13/51] Add tests --- dace/sdfg/validation.py | 53 ++++++++++++ tests/deferred_alloc_test.py | 154 +++++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 tests/deferred_alloc_test.py diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 092ae78a2e..9b0b52c90c 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -542,6 +542,59 @@ def validate_state(state: 'dace.sdfg.SDFGState', 'written to, but only given to nested SDFG as an ' 'input connector' % node.data, sdfg, state_id, nid) + # Deferred allocation related tests + insize = "_write_size" + outsize = "_read_size" + read_size_edges = list(state.edges_by_connector(node, outsize)) + write_size_edges = list(state.edges_by_connector(node, insize)) + + # Reading-Writing the size is valid only if the array is transient and has the storage type CPU_Heap or GPU_Global + has_writes_or_reads = len(read_size_edges) + len(write_size_edges) > 0 + size_access_allowed = arr.transient and (arr.storage == dtypes.StorageType.CPU_Heap or arr.storage == dtypes.StorageType.GPU_Global) + if has_writes_or_reads and not size_access_allowed: + raise InvalidSDFGNodeError('Reading the size of an array, or changing (writing to) the size of an array ' + 'is only valid if the array is transient and the storage is CPU_Heap or GPU_Global', sdfg, state_id, nid) + + if len(write_size_edges) > 1: + raise InvalidSDFGNodeError('One node can have at maximum one edge writing to its size descriptior', sdfg, state_id, nid) + + # The write needs to always have the same length of the dimension of the node + if len(write_size_edges) == 1: + write_size_edge = write_size_edges[0] + edge_id = state.edge_id(write_size_edge) + required_range = len(arr.shape) + try: + elements = int(write_size_edge.data.num_elements()) + if elements != required_range or write_size_edge.data.subset.dims() != 1: + raise Exception + except Exception: + raise InvalidSDFGEdgeError('The write to a node needs to match the length of the array shape ' + 'the volume needs to be integer (not symbolic) and the shape one dimensional', sdfg, state_id, edge_id) + + # Reads to map can be only scalars-sized + for read_size_edge in read_size_edges: + edge_id = state.edge_id(read_size_edge) + from dace import nodes + if (isinstance(read_size_edge.dst, nodes.EntryNode) or + isinstance(read_size_edge.dst, nodes.AccessNode) or + isinstance(read_size_edge.dst, nodes.Tasklet)): + if isinstance(read_size_edge.dst, nodes.MapEntry): + required_range = 1 + try: + elements = int(read_size_edge.data.num_elements()) + if elements != required_range and read_size_edge.data.subset.dims() != 1: + raise Exception() + except Exception: + raise 
InvalidSDFGEdgeError('The read to a map entry needs have dimension 1' + 'If reading multiple dimensions, multiple edges need to go to the map entry', sdfg, state_id, edge_id) + else: + raise InvalidSDFGEdgeError('The read size should connect to an entry node, access node, or tasklet (this can be changed)' + , sdfg, state_id, edge_id) + + + + + if (isinstance(node, nd.ConsumeEntry) and "IN_stream" not in node.in_connectors): raise InvalidSDFGNodeError("Consume entry node must have an input stream", sdfg, state_id, nid) if (isinstance(node, nd.ConsumeEntry) and "OUT_stream" not in node.out_connectors): diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py new file mode 100644 index 0000000000..5eed3d9173 --- /dev/null +++ b/tests/deferred_alloc_test.py @@ -0,0 +1,154 @@ +import dace +import numpy + +def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, write_size="0:2"): + sdfg = dace.sdfg.SDFG(name="deferred_alloc_test") + + sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, transient=transient) + + state = sdfg.add_state("main") + + an_1 = state.add_access('A') + an_1.add_in_connector('_write_size') + + an_2 = state.add_array(name="user_size", shape=(2,), dtype=numpy.uint64) + + state.add_edge(an_2, None, an_1, '_write_size', + dace.Memlet(expr=f"user_size[{write_size}]") ) + + sdfg.save("def_alloc_1.sdfg") + + return sdfg + + +def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool): + sdfg = dace.sdfg.SDFG(name="deferred_alloc_test_4") + + sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient) + + state = sdfg.add_state("main") + + an_1 = state.add_access('A') + an_1.add_in_connector('_write_size') + an_1.add_out_connector('_read_size') + + an_2 = state.add_array(name="user_size", shape=(2,), dtype=numpy.uint64) + + state.add_edge(an_2, None, an_1, '_write_size', + dace.Memlet(expr="user_size[0:2]") ) + + map_entry, map_exit = state.add_map(name="map",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) }) + state.add_edge(an_1, '_read_size', map_entry, "__A_dim1_size", dace.Memlet(expr="A_size[1]")) + map_entry.add_in_connector("__A_dim1_size") + map_exit.add_in_connector("IN_A") + map_exit.add_out_connector("OUT_A") + + t1 = state.add_tasklet(name="assign", inputs={}, outputs={"_out"}, code="_out=3.0") + state.add_edge(map_entry, None, t1, None, dace.Memlet(None)) + state.add_edge(t1, "_out", map_exit, "IN_A", dace.Memlet(expr="A[i, j]")) + + an_3 = state.add_access('A') + state.add_edge(map_exit, "OUT_A", an_3, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,15-1, 1), (0,"__A_dim1_size-1", 1)]))) + + an_3.add_out_connector('_read_size') + map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) }) + state.add_edge(an_3, '_read_size', map_entry2, "__A_dim1_size", dace.Memlet(expr="A_size[1]")) + state.add_edge(an_3, None, map_entry2, "IN_A", dace.Memlet(expr="A[0:15, 0:__A_dim1_size]")) + map_entry2.add_in_connector("__A_dim1_size") + map_entry2.add_in_connector("IN_A") + map_entry2.add_out_connector("OUT_A") + map_exit2.add_in_connector("IN_A") + map_exit2.add_out_connector("OUT_A") + + t2 = state.add_tasklet(name="check", inputs={"_in"}, outputs={"_out"}, code='if (_in != 5.0){ throw std::runtime_error("fail"); } \n 
_out=_in;', language=dace.dtypes.Language.CPP) + state.add_edge(map_entry2, "OUT_A", t2, "_in", dace.Memlet(expr="A[i, j]")) + state.add_edge(t2, "_out", map_exit2, "IN_A", dace.Memlet(expr="A[i, j]")) + + an_5 = state.add_access('A') + state.add_edge(map_exit2, "OUT_A", an_5, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,15-1, 1), (0,"__A_dim1_size-1", 1)]))) + + sdfg.save("def_alloc_4.sdfg") + + return sdfg + + +def _valid_to_reallocate(transient, storage_type, scope): + return transient and (storage_type == dace.dtypes.StorageType.GPU_Global or storage_type == dace.dtypes.StorageType.CPU_Heap) + +def test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool): + sdfg = _get_trivial_alloc_sdfg(storage_type, transient) + try: + sdfg.validate() + except Exception: + if not _valid_to_reallocate(transient, storage_type, None): + return + else: + raise AssertionError("Realloc with transient data failed when it was expected not to.") + + if not _valid_to_reallocate(transient, storage_type, None): + raise AssertionError("Realloc with non-transient data did not fail when it was expected to.") + + sdfg.compile() + +def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool): + sdfg = _get_assign_map_sdfg(storage_type, transient) + try: + sdfg.validate() + except Exception: + if not _valid_to_reallocate(transient, storage_type, None): + return + else: + raise AssertionError("Realloc-use with transient data failed when it was expected not to.") + + if not _valid_to_reallocate(transient, storage_type, None): + raise AssertionError("Realloc-use with non-transient data did not fail when it was expected to.") + + sdfg.compile() + +def test_incomplete_write_dimensions_1(): + sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2") + try: + sdfg.validate() + except Exception: + return + + raise AssertionError("Realloc-use with transient data and incomplete write did not fail when it was expected to.") + +def test_incomplete_write_dimensions_2(): + sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, False, "1:2") + try: + sdfg.validate() + except Exception: + return + + raise AssertionError("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.") + +def test_realloc_inside_map(): + pass + +if __name__ == "__main__": + for storage_type in [dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global]: + print(f"Trivial Realloc with storage {storage_type}") + test_trivial_realloc(storage_type, True) + print(f"Trivial Realloc-Use with storage {storage_type}") + test_realloc_use(storage_type, True) + + for storage_type in [dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global]: + print(f"Trivial Realloc with storage {storage_type} on non-transient data") + test_trivial_realloc(storage_type, False) + print(f"Trivial Realloc-Use with storage {storage_type} on non-transient data") + test_realloc_use(storage_type, False) + + # Try some other combinations + for transient in [True, False]: + for storage_type in [dace.dtypes.StorageType.Default, dace.dtypes.StorageType.Register]: + print(f"Trivial Realloc with storage {storage_type} on transient:{transient} data") + test_trivial_realloc(storage_type, transient) + print(f"Trivial Realloc-Use with storage {storage_type} on transient:{transient} data") + test_realloc_use(storage_type, transient) + + print(f"Realloc with incomplete write 1") + test_incomplete_write_dimensions_1() + print(f"Realloc with incomplete write 2") + 
test_incomplete_write_dimensions_2()
\ No newline at end of file

From 506d0aaf2a8302b4ee5d9805acd0bf2c15d08cd5 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Wed, 20 Nov 2024 11:54:40 +0100
Subject: [PATCH 14/51] Change connector names

---
 dace/codegen/targets/cpu.py  |  4 ++--
 dace/codegen/targets/cuda.py |  2 +-
 dace/sdfg/sdfg.py            | 44 +++++++++++++++++++-----------------
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index 163bbf933e..5eab35d384 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -1205,7 +1205,7 @@ def process_out_memlets(self,
         elif isinstance(node, nodes.AccessNode):
             if dst_node != node and not isinstance(dst_node, nodes.Tasklet) :
                 # If it is a size change, reallocate will be called
-                if edge.dst_conn is not None and edge.dst_conn == "IN_size":
+                if edge.dst_conn is not None and edge.dst_conn == "_write_size":
                     continue
 
                 dispatcher.dispatch_copy(
@@ -2269,7 +2269,7 @@ def _generate_AccessNode(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub
             if memlet_path[-1].dst == node:
                 src_node = memlet_path[0].src
 
-                if in_connector == "IN_size":
+                if in_connector == "_write_size":
                     self._dispatcher.dispatch_reallocate(
                         src_node,
                         node,
diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index d70bb4097d..1feef7ba11 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1507,7 +1507,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
         const_params = _get_const_params(dfg_scope)
         # make dynamic map inputs constant
         # TODO move this into _get_const_params(dfg_scope)
-        # Do not add src as const if the size is being red (src_conn is OUT_size)
+        # Do not add src as const if the size is being read (src_conn is _read_size)
         const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if not e.src_conn.endswith("size"))
 
         # Store init/exit code streams
diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py
index e8cfe3426e..3f49506ad9 100644
--- a/dace/sdfg/sdfg.py
+++ b/dace/sdfg/sdfg.py
@@ -1085,7 +1085,7 @@ def as_schedule_tree(self, in_place: bool = False) -> 'ScheduleTreeScope':
         the execution order of the SDFG. Each node in the tree can either represent a single statement
         (symbol assignment, tasklet, copy, library node, etc.) or a ``ScheduleTreeScope`` block (map,
         for-loop, pipeline, etc.) that contains other nodes.
-        
+
         It can be used to generate code from an SDFG, or to perform schedule transformations on
         the SDFG. For example, erasing an empty if branch, or merging two consecutive for-loops.
@@ -1753,20 +1753,21 @@ def add_array(self,
         if isinstance(dtype, type) and dtype in dtypes._CONSTANT_TYPES[:-1]:
             dtype = dtypes.typeclass(dtype)
 
-        size_desc = dt.Array(dtype=dace.uint64,
-                             shape=(len(shape),),
-                             storage=dtypes.StorageType.Default,
-                             location=None,
-                             allow_conflicts=False,
-                             transient=True,
-                             strides=(1,),
-                             offset=(0,),
-                             lifetime=lifetime,
-                             alignment=alignment,
-                             debuginfo=debuginfo,
-                             total_size=len(shape),
-                             may_alias=False,
-                             size_desc_name=None)
+        if transient:
+            size_desc = dt.Array(dtype=dace.uint64,
+                                 shape=(len(shape),),
+                                 storage=dtypes.StorageType.Default,
+                                 location=None,
+                                 allow_conflicts=False,
+                                 transient=True,
+                                 strides=(1,),
+                                 offset=(0,),
+                                 lifetime=lifetime,
+                                 alignment=alignment,
+                                 debuginfo=debuginfo,
+                                 total_size=len(shape),
+                                 may_alias=False,
+                                 size_desc_name=None)
 
         desc = dt.Array(dtype=dtype,
                         shape=shape,
@@ -1784,11 +1785,12 @@ def add_array(self,
                         size_desc_name=None)
 
         array_name = self.add_datadesc(name, desc, find_new_name=find_new_name)
-        size_desc_name = f"{array_name}_size"
-        self.add_datadesc(size_desc_name, size_desc, find_new_name=False)
-        # In case find_new_name and a new name is returned
-        # we need to update the size descriptor name of the array
-        desc.size_desc_name = size_desc_name
+        if transient:
+            size_desc_name = f"{array_name}_size"
+            self.add_datadesc(size_desc_name, size_desc, find_new_name=False)
+            # In case find_new_name and a new name is returned
+            # we need to update the size descriptor name of the array
+            desc.size_desc_name = size_desc_name
         return array_name, desc
 
     def add_view(self,
@@ -2542,7 +2544,7 @@ def auto_optimize(self,
         """
         Runs a basic sequence of transformations to optimize a given SDFG to decent
         performance. In particular, performs the following:
-        
+
         * Simplify
         * Auto-parallelization (loop-to-map)
         * Greedy application of SubgraphFusion

From b956142c75868a570f036a47e6292a1b02b7030c Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Wed, 20 Nov 2024 12:48:04 +0100
Subject: [PATCH 15/51] Add more test cases and fix some bugs

---
 dace/codegen/targets/cpp.py  |  8 +++++
 dace/codegen/targets/cpu.py  | 69 ++++++++++++++++++++++--------------
 dace/subsets.py              | 30 ++++++++--------
 tests/deferred_alloc_test.py | 59 ++++++++++++++++++++----------
 4 files changed, 106 insertions(+), 60 deletions(-)

diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py
index 3ed64d994b..494890089b 100644
--- a/dace/codegen/targets/cpp.py
+++ b/dace/codegen/targets/cpp.py
@@ -232,6 +232,14 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher',
     elif memlet.data == dst_node.data:
         copy_shape, src_strides = reshape_strides(dst_subset, dst_strides, src_strides, copy_shape)
 
+    def replace_dace_defer_dim(string, arrname):
+        pattern = r"__dace_defer_dim(\d+)"
+        return re.sub(pattern, arrname + r"_size[\1]", string)
+
+    # TODO: do this better? 
+ dst_expr = replace_dace_defer_dim(dst_expr, dst_node.data) if dst_expr is not None else None + src_expr = replace_dace_defer_dim(src_expr, src_node.data) if src_expr is not None else None + return copy_shape, src_strides, dst_strides, src_expr, dst_expr diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 5eab35d384..1f7c8debaa 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -756,27 +756,34 @@ def _emit_copy( if isinstance(dst_node, nodes.Tasklet): # Copy into tasklet + desc = sdfg.arrays[memlet.data] + deferred_size_names = self._get_deferred_size_names(desc, memlet) stream.write( - " " + self.memlet_definition(sdfg, memlet, False, vconn, dst_node.in_connectors[vconn]), + " " + self.memlet_definition(sdfg, memlet, False, vconn, dst_node.in_connectors[vconn], deferred_size_names=deferred_size_names), cfg, state_id, [src_node, dst_node], ) - stream.write( - "//u1" - ) + if deferred_size_names is not None: + stream.write( + "// Size uses deferred allocation" + ) + return elif isinstance(src_node, nodes.Tasklet): # Copy out of tasklet + desc = sdfg.arrays[memlet.data] + deferred_size_names = self._get_deferred_size_names(desc, memlet) stream.write( - " " + self.memlet_definition(sdfg, memlet, True, uconn, src_node.out_connectors[uconn]), + " " + self.memlet_definition(sdfg, memlet, True, uconn, src_node.out_connectors[uconn], deferred_size_names=deferred_size_names), cfg, state_id, [src_node, dst_node], ) - stream.write( - "//u2" - ) + if deferred_size_names is not None: + stream.write( + "// Size uses deferred allocation" + ) return else: # Copy array-to-array src_nodedesc = src_node.desc(sdfg) @@ -875,6 +882,7 @@ def _emit_copy( state_dfg: SDFGState = cfg.nodes()[state_id] + copy_shape, src_strides, dst_strides, src_expr, dst_expr = cpp.memlet_copy_to_absolute_strides( self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._packed_types) @@ -1043,6 +1051,27 @@ def write_and_resolve_expr(self, sdfg: SDFG, memlet: mmlt.Memlet, nc: bool, outn custom_reduction = cpp.unparse_cr(sdfg, memlet.wcr, dtype) return (f'dace::wcr_custom<{dtype.ctype}>:: template {func}({custom_reduction}, {ptr}, {inname})') + def _get_deferred_size_names(self, desc, memlet): + if (desc.storage != dtypes.StorageType.GPU_Global and + desc.storage != dtypes.StorageType.CPU_Heap and + not desc.transient): + return None + def check_dace_defer(elements): + for elem in elements: + if isinstance(elem, symbolic.symbol) and str(elem).startswith("__dace_defer"): + return True + return False + deferred_size_names = None + if check_dace_defer(desc.shape): + if desc.storage == dtypes.StorageType.GPU_Global or desc.storage == dtypes.StorageType.CPU_Heap: + deferred_size_names = [] + for i, elem in enumerate(desc.shape): + if str(elem).startswith("__dace_defer"): + deferred_size_names.append(f"__{memlet.data}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]") + else: + deferred_size_names.append(elem) + return deferred_size_names if len(deferred_size_names) > 0 else None + def process_out_memlets(self, sdfg: SDFG, cfg: ControlFlowRegion, @@ -1179,22 +1208,7 @@ def process_out_memlets(self, # If the storage type if CPU_Heap or GPU_Global then it might be requiring deferred allocation # We can check if the array requires sepcial access using A_size[0] (CPU) or __A_dim0_size (GPU0) # by going through the shape and checking for symbols starting with __dace_defer - def check_dace_defer(elements): - for elem in elements: - if 
isinstance(elem, symbolic.symbol) and str(elem).startswith("__dace_defer"): - return True - return False - deferred_size_names = None - if check_dace_defer(desc.shape): - if desc.storage == dtypes.StorageType.GPU_Global or desc.storage == dtypes.StorageType.CPU_Heap: - deferred_size_names = [] - for i, elem in enumerate(desc.shape): - if str(elem).startswith("__dace_defer"): - deferred_size_names.append(f"__{memlet.data}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]") - else: - deferred_size_names.append(elem) - else: - raise Exception("Deferred Allocation only supported on array storages of type GPU_Global or CPU_Heap") + deferred_size_names = self._get_deferred_size_names(desc, memlet) expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame, deferred_size_names=deferred_size_names) write_expr = codegen.make_ptr_assignment(in_local_name, conntype, expr, desc_dtype) @@ -1332,7 +1346,8 @@ def memlet_definition(self, local_name: str, conntype: Union[data.Data, dtypes.typeclass] = None, allow_shadowing: bool = False, - codegen: 'CPUCodeGen' = None): + codegen: 'CPUCodeGen' = None, + deferred_size_names = None): # TODO: Robust rule set if conntype is None: raise ValueError('Cannot define memlet for "%s" without connector type' % local_name) @@ -1381,7 +1396,7 @@ def memlet_definition(self, decouple_array_interfaces=decouple_array_interfaces) result = '' - expr = (cpp.cpp_array_expr(sdfg, memlet, with_brackets=False, codegen=self._frame) + expr = (cpp.cpp_array_expr(sdfg, memlet, with_brackets=False, codegen=self._frame, deferred_size_names=deferred_size_names) if var_type in [DefinedType.Pointer, DefinedType.StreamArray, DefinedType.ArrayInterface] else ptr) if expr != ptr: @@ -1425,7 +1440,7 @@ def memlet_definition(self, if not memlet.dynamic and memlet.num_accesses == 1: if not output: if isinstance(desc, data.Stream) and desc.is_stream_array(): - index = cpp.cpp_offset_expr(desc, memlet.subset) + index = cpp.cpp_offset_expr(desc, memlet.subset, deferred_size_names=deferred_size_names) expr = f"{memlet.data}[{index}]" result += f'{memlet_type} {local_name} = ({expr}).pop();' defined = DefinedType.Scalar diff --git a/dace/subsets.py b/dace/subsets.py index 0fdc36c22e..e6d69e1a67 100644 --- a/dace/subsets.py +++ b/dace/subsets.py @@ -99,7 +99,7 @@ def covers(self, other): return False return True - + def covers_precise(self, other): """ Returns True if self contains all the elements in other. """ @@ -734,7 +734,7 @@ def compose(self, other): def squeeze(self, ignore_indices: Optional[List[int]] = None, offset: bool = True) -> List[int]: """ Removes size-1 ranges from the subset and returns a list of dimensions that remain. - + For example, ``[i:i+10, j]`` will change the range to ``[i:i+10]`` and return ``[0]``. If ``offset`` is True, the subset will become ``[0:10]``. @@ -770,7 +770,7 @@ def squeeze(self, ignore_indices: Optional[List[int]] = None, offset: bool = Tru def unsqueeze(self, axes: Sequence[int]) -> List[int]: """ Adds 0:1 ranges to the subset, in the indices contained in axes. - + The method is mostly used to restore subsets that had their length-1 ranges removed (i.e., squeezed subsets). Hence, the method is called 'unsqueeze'. @@ -1046,7 +1046,7 @@ def squeeze(self, ignore_indices=None): def unsqueeze(self, axes: Sequence[int]) -> List[int]: """ Adds zeroes to the subset, in the indices contained in axes. - + The method is mostly used to restore subsets that had their zero-indices removed (i.e., squeezed subsets). 
Hence, the method is called 'unsqueeze'. @@ -1112,7 +1112,7 @@ def __init__(self, subset): self.subset_list = [subset] def covers(self, other): - """ + """ Returns True if this SubsetUnion covers another subset (using a bounding box). If other is another SubsetUnion then self and other will only return true if self is other. If other is a different type of subset @@ -1128,13 +1128,13 @@ def covers(self, other): return False else: return any(s.covers(other) for s in self.subset_list) - + def covers_precise(self, other): - """ + """ Returns True if this SubsetUnion covers another subset. If other is another SubsetUnion then self and other will only return true if self is other. If other is a different type of subset - true is returned when one of the subsets in self is equal to other + true is returned when one of the subsets in self is equal to other """ if isinstance(other, SubsetUnion): @@ -1154,7 +1154,7 @@ def __str__(self): string += " " string += subset.__str__() return string - + def dims(self): if not self.subset_list: return 0 @@ -1178,7 +1178,7 @@ def free_symbols(self) -> Set[str]: for subset in self.subset_list: result |= subset.free_symbols return result - + def replace(self, repl_dict): for subset in self.subset_list: subset.replace(repl_dict) @@ -1192,15 +1192,15 @@ def num_elements(self): min = subset.num_elements() except: continue - + return min def _union_special_cases(arb: symbolic.SymbolicType, brb: symbolic.SymbolicType, are: symbolic.SymbolicType, bre: symbolic.SymbolicType): - """ - Special cases of subset unions. If case found, returns pair of + """ + Special cases of subset unions. If case found, returns pair of (min,max), otherwise returns None. """ if are + 1 == brb: @@ -1267,7 +1267,7 @@ def union(subset_a: Subset, subset_b: Subset) -> Subset: """ Compute the union of two Subset objects. If the subsets are not of the same type, degenerates to bounding-box union. - + :param subset_a: The first subset. :param subset_b: The second subset. :return: A Subset object whose size is at least the union of the two @@ -1303,7 +1303,7 @@ def union(subset_a: Subset, subset_b: Subset) -> Subset: def list_union(subset_a: Subset, subset_b: Subset) -> Subset: - """ + """ Returns the union of two Subset lists. :param subset_a: The first subset. 
diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py index 5eed3d9173..d2cf87168f 100644 --- a/tests/deferred_alloc_test.py +++ b/tests/deferred_alloc_test.py @@ -1,5 +1,6 @@ import dace import numpy +import cupy def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, write_size="0:2"): sdfg = dace.sdfg.SDFG(name="deferred_alloc_test") @@ -11,17 +12,15 @@ def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bo an_1 = state.add_access('A') an_1.add_in_connector('_write_size') - an_2 = state.add_array(name="user_size", shape=(2,), dtype=numpy.uint64) + an_2 = state.add_array(name="user_size", shape=(2,), dtype=dace.uint64) state.add_edge(an_2, None, an_1, '_write_size', dace.Memlet(expr=f"user_size[{write_size}]") ) - sdfg.save("def_alloc_1.sdfg") - return sdfg -def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool): +def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType.Default): sdfg = dace.sdfg.SDFG(name="deferred_alloc_test_4") sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, @@ -33,12 +32,13 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool) an_1.add_in_connector('_write_size') an_1.add_out_connector('_read_size') - an_2 = state.add_array(name="user_size", shape=(2,), dtype=numpy.uint64) + an_2 = state.add_array(name="user_size", shape=(2,), dtype=dace.uint64) state.add_edge(an_2, None, an_1, '_write_size', dace.Memlet(expr="user_size[0:2]") ) - map_entry, map_exit = state.add_map(name="map",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) }) + map_entry, map_exit = state.add_map(name="map",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) }, + schedule=schedule_type) state.add_edge(an_1, '_read_size', map_entry, "__A_dim1_size", dace.Memlet(expr="A_size[1]")) map_entry.add_in_connector("__A_dim1_size") map_exit.add_in_connector("IN_A") @@ -51,8 +51,17 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool) an_3 = state.add_access('A') state.add_edge(map_exit, "OUT_A", an_3, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,15-1, 1), (0,"__A_dim1_size-1", 1)]))) + arr_name, arr = sdfg.add_array(name="example_array", dtype=dace.float32, shape=(1,), transient=False, storage=storage_type) + arrn = state.add_access(arr_name) + + if storage_type == dace.dtypes.StorageType.CPU_Heap: + assert (schedule_type == dace.dtypes.ScheduleType.Sequential) + elif storage_type == dace.dtypes.StorageType.GPU_Global: + assert (schedule_type == dace.dtypes.ScheduleType.GPU_Device) + an_3.add_out_connector('_read_size') - map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)]) }) + map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)])}, + schedule=schedule_type) state.add_edge(an_3, '_read_size', map_entry2, "__A_dim1_size", dace.Memlet(expr="A_size[1]")) state.add_edge(an_3, None, map_entry2, "IN_A", dace.Memlet(expr="A[0:15, 0:__A_dim1_size]")) map_entry2.add_in_connector("__A_dim1_size") @@ -61,14 +70,14 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool) map_exit2.add_in_connector("IN_A") 
map_exit2.add_out_connector("OUT_A") - t2 = state.add_tasklet(name="check", inputs={"_in"}, outputs={"_out"}, code='if (_in != 5.0){ throw std::runtime_error("fail"); } \n _out=_in;', language=dace.dtypes.Language.CPP) + t2 = state.add_tasklet(name="check", inputs={"_in"}, outputs={"_out"}, code='_out = _in', language=dace.dtypes.Language.Python) state.add_edge(map_entry2, "OUT_A", t2, "_in", dace.Memlet(expr="A[i, j]")) state.add_edge(t2, "_out", map_exit2, "IN_A", dace.Memlet(expr="A[i, j]")) an_5 = state.add_access('A') state.add_edge(map_exit2, "OUT_A", an_5, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,15-1, 1), (0,"__A_dim1_size-1", 1)]))) - sdfg.save("def_alloc_4.sdfg") + state.add_edge(an_5, None, arrn, None, dace.memlet.Memlet("A[7, 7]")) return sdfg @@ -91,8 +100,8 @@ def test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool) sdfg.compile() -def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool): - sdfg = _get_assign_map_sdfg(storage_type, transient) +def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType): + sdfg = _get_assign_map_sdfg(storage_type, transient, schedule_type) try: sdfg.validate() except Exception: @@ -104,7 +113,18 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool): if not _valid_to_reallocate(transient, storage_type, None): raise AssertionError("Realloc-use with non-transient data did not fail when it was expected to.") - sdfg.compile() + compiled_sdfg = sdfg.compile() + if storage_type == dace.dtypes.StorageType.CPU_Heap: + arr = numpy.array([-1.0]).astype(numpy.float32) + user_size = numpy.array([10, 10]).astype(numpy.uint64) + compiled_sdfg (user_size=user_size, example_array=arr) + assert ( arr[0] == 3.0 ) + if storage_type == dace.dtypes.StorageType.GPU_Global: + arr = cupy.array([-1.0]).astype(cupy.float32) + user_size = numpy.array([10, 10]).astype(numpy.uint64) + compiled_sdfg (user_size=user_size, example_array=arr) + assert ( arr.get()[0] == 3.0 ) + def test_incomplete_write_dimensions_1(): sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2") @@ -128,25 +148,28 @@ def test_realloc_inside_map(): pass if __name__ == "__main__": - for storage_type in [dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global]: + for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), + (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: print(f"Trivial Realloc with storage {storage_type}") test_trivial_realloc(storage_type, True) print(f"Trivial Realloc-Use with storage {storage_type}") - test_realloc_use(storage_type, True) + test_realloc_use(storage_type, True, schedule_type) - for storage_type in [dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global]: + for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), + (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: print(f"Trivial Realloc with storage {storage_type} on non-transient data") test_trivial_realloc(storage_type, False) print(f"Trivial Realloc-Use with storage {storage_type} on non-transient data") - test_realloc_use(storage_type, False) + test_realloc_use(storage_type, False, schedule_type) # Try some other combinations for transient in [True, False]: - for storage_type in [dace.dtypes.StorageType.Default, dace.dtypes.StorageType.Register]: + for storage_type, 
schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), + (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: print(f"Trivial Realloc with storage {storage_type} on transient:{transient} data") test_trivial_realloc(storage_type, transient) print(f"Trivial Realloc-Use with storage {storage_type} on transient:{transient} data") - test_realloc_use(storage_type, transient) + test_realloc_use(storage_type, transient, schedule_type) print(f"Realloc with incomplete write 1") test_incomplete_write_dimensions_1() From 82cdfdee879ec842276a8d7ae2812b6a5cc88d6d Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 3 Dec 2024 12:21:07 +0100 Subject: [PATCH 16/51] Bug fixes --- dace/codegen/targets/cpu.py | 14 ++++++++------ dace/codegen/targets/cuda.py | 18 +++++++++--------- dace/frontend/python/newast.py | 16 ++++++++-------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 1f7c8debaa..ecec9d2a90 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -340,11 +340,13 @@ def declare_array(self, declaration_stream.write(f'{nodedesc.dtype.ctype} *{name} = nullptr;\n', cfg, state_id, node) self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef) - size_arr_name = sdfg.arrays[name].size_desc_name - size_arr_desc = sdfg.arrays[size_arr_name] - size_ctypedef = dtypes.pointer(size_arr_desc.dtype).ctype - - self._dispatcher.declared_arrays.add(size_arr_name, DefinedType.Pointer, size_ctypedef) + # Size desc is defined only for transient arrays + if nodedesc.transient and nodedesc.storage == dtypes.StorageType.CPU_Heap: + size_desc_name = sdfg.arrays[name].size_desc_name + if size_desc_name is not None: + size_desc = sdfg.arrays[size_desc_name] + size_ctypedef = dtypes.pointer(size_desc.dtype).ctype + self._dispatcher.declared_arrays.add(size_desc_name, DefinedType.Pointer, size_ctypedef) return elif nodedesc.storage is dtypes.StorageType.CPU_ThreadLocal: # Define pointer once @@ -1070,7 +1072,7 @@ def check_dace_defer(elements): deferred_size_names.append(f"__{memlet.data}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]") else: deferred_size_names.append(elem) - return deferred_size_names if len(deferred_size_names) > 0 else None + return deferred_size_names if deferred_size_names is not None and len(deferred_size_names) > 0 else None def process_out_memlets(self, sdfg: SDFG, diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 436e4f0fca..7793ddc92d 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -610,10 +610,12 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if not declared: result_decl.write('%s %s;\n' % (ctypedef, dataname)) size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape]) - size_desc_name = nodedesc.size_desc_name - size_nodedesc = sdfg.arrays[size_desc_name] - result_decl.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n') - self._dispatcher.defined_vars.add(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype) + if nodedesc.transient: + size_desc_name = nodedesc.size_desc_name + if size_desc_name is not None: + size_nodedesc = sdfg.arrays[size_desc_name] + result_decl.write(f'{size_nodedesc.dtype.ctype} 
{size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n') + self._dispatcher.defined_vars.add(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype) self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) @@ -1481,7 +1483,6 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub if isinstance(node, nodes.AccessNode): nsdfg: SDFG = parent.parent desc = node.desc(nsdfg) - sizedesc = nsdfg.arrays[desc.size_desc_name] if (nsdfg, node.data) in visited: continue visited.add((nsdfg, node.data)) @@ -1584,10 +1585,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub if aname in sdfg.arrays: size_arr_name = data_desc.size_desc_name - size_arr = sdfg.arrays[data_desc.size_desc_name] - size_arr_len = size_arr.shape[0] - size_arr_dtype = size_arr.dtype.ctype - host_size_args[size_arr_name] = size_arr + if size_arr_name is not None: + size_arr = sdfg.arrays[data_desc.size_desc_name] + host_size_args[size_arr_name] = size_arr kernel_args_typed = [('const ' if k in const_params else '') + v.as_arg(name=k) for k, v in prototype_kernel_args.items()] diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index 1cbb8e67c9..255bd26983 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -189,13 +189,13 @@ def parse_dace_program(name: str, Parses a ``@dace.program`` function into an SDFG. :param src_ast: The AST of the Python program to parse. - :param visitor: A ProgramVisitor object returned from + :param visitor: A ProgramVisitor object returned from ``preprocess_dace_program``. :param closure: An object that contains the @dace.program closure. :param simplify: If True, simplification pass will be performed. :param save: If True, saves source mapping data for this SDFG. - :param progress: If True, prints a progress bar of the parsing process. - If None (default), prints after 5 seconds of parsing. + :param progress: If True, prints a progress bar of the parsing process. + If None (default), prints after 5 seconds of parsing. If False, never prints progress. :return: A 2-tuple of SDFG and its reduced (used) closure. """ @@ -1466,8 +1466,8 @@ def _parse_subprogram(self, name, node, is_tasklet=False, extra_symbols=None, ex def _symbols_from_params(self, params: List[Tuple[str, Union[str, dtypes.typeclass]]], memlet_inputs: Dict[str, Memlet]) -> Dict[str, symbolic.symbol]: """ - Returns a mapping between symbol names to their type, as a symbol - object to maintain compatibility with global symbols. Used to maintain + Returns a mapping between symbol names to their type, as a symbol + object to maintain compatibility with global symbols. Used to maintain typed symbols in SDFG scopes (e.g., map, consume). """ from dace.codegen.tools.type_inference import infer_expr_type @@ -1900,7 +1900,7 @@ def _parse_map_inputs(self, name: str, params: List[Tuple[str, str]], def _parse_consume_inputs(self, node: ast.FunctionDef) -> Tuple[str, str, Tuple[str, str], str, str]: """ Parse consume parameters from AST. - + :return: A 5-tuple of Stream name, internal stream name, (PE index, number of PEs), condition, chunk size. """ @@ -2179,7 +2179,7 @@ def _add_dependencies(self, state.add_nedge(internal_node, exit_node, dace.Memlet()) def _add_nested_symbols(self, nsdfg_node: nodes.NestedSDFG): - """ + """ Adds symbols from nested SDFG mapping values (if appear as globals) to current SDFG. 
""" @@ -4769,7 +4769,7 @@ def visit_With(self, node: ast.With, is_async=False): evald = astutils.evalnode(node.items[0].context_expr, self.globals) if hasattr(evald, "name"): named_region_name: str = evald.name - else: + else: named_region_name = f"Named Region {node.lineno}" named_region = NamedRegion(named_region_name, debuginfo=self.current_lineinfo) self.cfg_target.add_node(named_region) From 97bc728c8aa19cb9fddfb073d6b290b95a9ad60b Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 3 Dec 2024 12:30:25 +0100 Subject: [PATCH 17/51] More codegen fixes --- dace/sdfg/validation.py | 9 +++++---- tests/deferred_alloc_test.py | 34 ++++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 2e01b5883f..5212147c03 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -747,8 +747,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', if isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Structure): name = None # Special case: if the name is the size array of the src_node, then it is ok, checked with the "size_desc_name" - src_size_access = isinstance(src_node, nd.AccessNode) and name == sdfg.arrays[src_node.data].size_desc_name - dst_size_access = isinstance(dst_node, nd.AccessNode) and name == sdfg.arrays[dst_node.data].size_desc_name + src_size_access = isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Array) and name is not None and name == sdfg.arrays[src_node.data].size_desc_name + dst_size_access = isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Array) and name is not None and name == sdfg.arrays[dst_node.data].size_desc_name sdict = state.scope_dict() if src_size_access and dst_size_access: raise InvalidSDFGEdgeError( @@ -766,9 +766,10 @@ def validate_state(state: 'dace.sdfg.SDFGState', ) if dst_size_access: dst_arr = sdfg.arrays[dst_node.data] - if dst_arr.storage != dace.dtypes.StorageType.GPU_Global or dst_arr.storage != dace.dtypes.StorageType.CPU_Heap: + if (dst_arr.storage != dtypes.StorageType.GPU_Global and + dst_arr.storage != dtypes.StorageType.CPU_Heap): raise InvalidSDFGEdgeError( - "Reallocating data (writing to the size connector) within a scope is not valid", + f"Reallocating data is allowed only to GPU_Global or CPU_Heap, the storage type is {dst_arr.storage}", sdfg, state_id, eid, diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py index d2cf87168f..6459ee6105 100644 --- a/tests/deferred_alloc_test.py +++ b/tests/deferred_alloc_test.py @@ -1,6 +1,22 @@ import dace import numpy import cupy +import pytest + +@pytest.fixture(params=[dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global]) +def storage_type(request): + return request.param + +@pytest.fixture(params=[True, False]) +def transient(request): + return request.param + +@pytest.fixture +def schedule_type(storage_type): + if storage_type == dace.dtypes.StorageType.CPU_Heap: + return dace.dtypes.ScheduleType.Sequential + elif storage_type == dace.dtypes.StorageType.GPU_Global: + return dace.dtypes.ScheduleType.GPU_Device def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, write_size="0:2"): sdfg = dace.sdfg.SDFG(name="deferred_alloc_test") @@ -126,26 +142,32 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sch assert ( arr.get()[0] == 3.0 ) +def test_realloc_inside_map(): + pass + + +def 
test_all_combinations(storage_type, transient, schedule_type): + test_trivial_realloc(storage_type, transient) + test_realloc_use(storage_type, transient, schedule_type) + def test_incomplete_write_dimensions_1(): - sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2") + sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2") try: sdfg.validate() except Exception: return - raise AssertionError("Realloc-use with transient data and incomplete write did not fail when it was expected to.") + pytest.fail("Realloc-use with transient data and incomplete write did not fail when it was expected to.") def test_incomplete_write_dimensions_2(): - sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, False, "1:2") + sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, False, "1:2") try: sdfg.validate() except Exception: return - raise AssertionError("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.") + pytest.fail("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.") -def test_realloc_inside_map(): - pass if __name__ == "__main__": for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), From 08cb50c5bb3290265396eb5e4449532e343b0c98 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 3 Dec 2024 13:15:48 +0100 Subject: [PATCH 18/51] Split size and array storage --- dace/codegen/targets/cpu.py | 5 +-- dace/codegen/targets/cuda.py | 5 +-- dace/sdfg/sdfg.py | 71 +++++++++++++++++++++++++++++------- 3 files changed, 62 insertions(+), 19 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index ecec9d2a90..613b4f8557 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -344,7 +344,7 @@ def declare_array(self, if nodedesc.transient and nodedesc.storage == dtypes.StorageType.CPU_Heap: size_desc_name = sdfg.arrays[name].size_desc_name if size_desc_name is not None: - size_desc = sdfg.arrays[size_desc_name] + size_desc = sdfg.size_arrays[size_desc_name] size_ctypedef = dtypes.pointer(size_desc.dtype).ctype self._dispatcher.declared_arrays.add(size_desc_name, DefinedType.Pointer, size_ctypedef) return @@ -514,7 +514,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV # Initialize size array size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape]) size_desc_name = nodedesc.size_desc_name - size_nodedesc = sdfg.arrays[size_desc_name] + size_nodedesc = sdfg.size_arrays[size_desc_name] declaration_stream.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n', cfg, state_id, node) if deferred_allocation: allocation_stream.write( @@ -708,7 +708,6 @@ def reallocate( data = sdfg.arrays[data_name] size_array_name = data.size_desc_name - new_size_array = sdfg.arrays[new_size_array_name] dtype = sdfg.arrays[data_name].dtype # Only consider the offsets with __dace_defer in original dim diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 7793ddc92d..e5ad6dc9dc 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -613,7 +613,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if nodedesc.transient: size_desc_name = nodedesc.size_desc_name if size_desc_name is not None: - size_nodedesc = sdfg.arrays[size_desc_name] + size_nodedesc = 
sdfg.size_arrays[size_desc_name] result_decl.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n') self._dispatcher.defined_vars.add(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype) self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) @@ -1586,7 +1586,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub if aname in sdfg.arrays: size_arr_name = data_desc.size_desc_name if size_arr_name is not None: - size_arr = sdfg.arrays[data_desc.size_desc_name] + size_arr = sdfg.size_arrays[data_desc.size_desc_name] host_size_args[size_arr_name] = size_arr kernel_args_typed = [('const ' if k in const_params else '') + v.as_arg(name=k) @@ -2796,7 +2796,6 @@ def reallocate( data = sdfg.arrays[data_name] size_array_name = data.size_desc_name - new_size_array = sdfg.arrays[new_size_array_name] dtype = sdfg.arrays[data_name].dtype # Only consider the offsets with __dace_defer in original dim diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 5006280c6e..8d8ea82484 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -418,6 +418,10 @@ class SDFG(ControlFlowRegion): desc="Data descriptors for this SDFG", to_json=_arrays_to_json, from_json=_nested_arrays_from_json) + _size_arrays = Property(dtype=NestedDict, + desc="Data size descriptors for this SDFG", + to_json=_arrays_to_json, + from_json=_nested_arrays_from_json) symbols = DictProperty(str, dtypes.typeclass, desc="Global symbols for this SDFG") instrument = EnumProperty(dtype=dtypes.InstrumentationType, @@ -496,6 +500,7 @@ def __init__(self, self._parent_sdfg = None self._parent_nsdfg_node = None self._arrays = NestedDict() # type: Dict[str, dt.Array] + self._size_arrays = NestedDict() self.arg_names = [] self._labels: Set[str] = set() self.global_code = {'frame': CodeBlock("", dtypes.Language.CPP)} @@ -683,6 +688,10 @@ def arrays(self): """ return self._arrays + @property + def size_arrays(self): + return self._size_arrays + @property def process_grids(self): """ Returns a dictionary of process-grid descriptors (`ProcessGrid` objects) used in this SDFG. """ @@ -746,6 +755,7 @@ def replace_dict(self, for name, new_name in repldict_filtered.items(): if validate_name(new_name): _replace_dict_keys(self._arrays, name, new_name) + _replace_dict_keys(self._size_arrays, name + "_size", new_name + "_size") _replace_dict_keys(self.symbols, name, new_name) _replace_dict_keys(self.constants_prop, name, new_name) _replace_dict_keys(self.callback_mapping, name, new_name) @@ -1151,7 +1161,10 @@ def remove_data(self, name, validate=True): f"{name}: it is accessed by node " f"{node} in state {state}.") + size_desc_name = self._arrays[name].size_desc_name del self._arrays[name] + if size_desc_name is not None: + del self._size_arrays[size_desc_name] def reset_sdfg_list(self): """ @@ -1676,13 +1689,15 @@ def _find_new_name(self, name: str): """ Tries to find a new name by adding an underscore and a number. 
""" names = (self._arrays.keys() | self.constants_prop.keys() | self._pgrids.keys() | self._subarrays.keys() - | self._rdistrarrays.keys() | self.symbols.keys()) + | self._rdistrarrays.keys() | self.symbols.keys() | self._size_arrays.keys()) return dt.find_new_name(name, names) def is_name_used(self, name: str) -> bool: """ Checks if `name` is already used inside the SDFG.""" if name in self._arrays: return True + if name in self._size_arrays: + return True if name in self.symbols: return True if name in self.constants_prop: @@ -1787,7 +1802,7 @@ def add_array(self, array_name = self.add_datadesc(name, desc, find_new_name=find_new_name) if transient: size_desc_name = f"{array_name}_size" - self.add_datadesc(size_desc_name, size_desc, find_new_name=False) + self.add_size_datadesc(size_desc_name, size_desc) # In case find_new_name and a new name is returned # we need to update the size descriptor name of the array desc.size_desc_name = size_desc_name @@ -2038,6 +2053,16 @@ def add_temp_transient_like(self, desc: Union[dt.Array, dt.Scalar], dtype=None, newdesc.debuginfo = debuginfo return self.add_datadesc(self.temp_data_name(), newdesc), newdesc + @staticmethod + def _add_symbols(sdfg, desc: dt.Data): + if isinstance(desc, dt.Structure): + for v in desc.members.values(): + if isinstance(v, dt.Data): + SDFG._add_symbols(sdfg, v) + for sym in desc.free_symbols: + if sym.name not in sdfg.symbols: + sdfg.add_symbol(sym.name, sym.dtype) + def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str: """ Adds an existing data descriptor to the SDFG array store. @@ -2067,7 +2092,7 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str else: # We do not check for data constant, because there is a link between the constants and # the data descriptors. - if name in self.arrays: + if name in self.arrays or name in self.size_arrays: raise FileExistsError(f'Data descriptor "{name}" already exists in SDFG') if name in self.symbols: raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a symbol.') @@ -2078,18 +2103,38 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str if name in self._pgrids: raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a ProcessGrid.') - def _add_symbols(sdfg: SDFG, desc: dt.Data): - if isinstance(desc, dt.Structure): - for v in desc.members.values(): - if isinstance(v, dt.Data): - _add_symbols(sdfg, v) - for sym in desc.free_symbols: - if sym.name not in sdfg.symbols: - sdfg.add_symbol(sym.name, sym.dtype) - # Add the data descriptor to the SDFG and all symbols that are not yet known. self._arrays[name] = datadesc - _add_symbols(self, datadesc) + SDFG._add_symbols(self, datadesc) + + return name + + def add_size_datadesc(self, name: str, datadesc: dt.Data) -> str: + """ Adds an existing data descriptor to the SDFG array store. + + :param name: Name to use. + :param datadesc: Data descriptor to add. + :param find_new_name: If True and data descriptor with this name + exists, finds a new name to add. + :return: Name of the new data descriptor + """ + if not isinstance(name, str): + raise TypeError("Data descriptor name must be a string. 
Got %s" % type(name).__name__) + + if name in self.arrays or name in self.size_arrays: + raise FileExistsError(f'Data descriptor "{name}" already exists in SDFG') + if name in self.symbols: + raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a symbol.') + if name in self._subarrays: + raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a subarray.') + if name in self._rdistrarrays: + raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a RedistrArray.') + if name in self._pgrids: + raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a ProcessGrid.') + + # Add the data descriptor to the SDFG and all symbols that are not yet known. + self._size_arrays[name] = datadesc + SDFG._add_symbols(self, datadesc) return name From ac90c86cdc8ead071b04649c388df5ea8949320d Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 3 Dec 2024 16:20:53 +0100 Subject: [PATCH 19/51] Major fixes regarding name changes etc. --- dace/codegen/targets/cpu.py | 13 +- dace/codegen/targets/cuda.py | 4 +- dace/data.py | 5 +- dace/memlet.py | 40 +++--- dace/sdfg/infer_types.py | 12 +- dace/sdfg/replace.py | 4 +- dace/sdfg/sdfg.py | 130 ++++++++---------- dace/sdfg/validation.py | 11 +- .../dataflow/redundant_array.py | 2 +- .../passes/array_elimination.py | 12 +- tests/deferred_alloc_test.py | 31 ++++- 11 files changed, 144 insertions(+), 120 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 613b4f8557..6a2b89e4ae 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -319,7 +319,6 @@ def declare_array(self, name = node.root_data ptrname = cpp.ptr(name, nodedesc, sdfg, self._frame) - print("D2", name, nodedesc) if nodedesc.transient is False: return @@ -344,7 +343,7 @@ def declare_array(self, if nodedesc.transient and nodedesc.storage == dtypes.StorageType.CPU_Heap: size_desc_name = sdfg.arrays[name].size_desc_name if size_desc_name is not None: - size_desc = sdfg.size_arrays[size_desc_name] + size_desc = sdfg.arrays[size_desc_name] size_ctypedef = dtypes.pointer(size_desc.dtype).ctype self._dispatcher.declared_arrays.add(size_desc_name, DefinedType.Pointer, size_ctypedef) return @@ -513,9 +512,13 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node) # Initialize size array size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape]) - size_desc_name = nodedesc.size_desc_name - size_nodedesc = sdfg.size_arrays[size_desc_name] - declaration_stream.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n', cfg, state_id, node) + if (nodedesc.transient and ( + nodedesc.storage == dtypes.StorageType.CPU_Heap or + nodedesc.storage == dtypes.StorageType.GPU_Global) + ): + size_desc_name = nodedesc.size_desc_name + size_nodedesc = sdfg.arrays[size_desc_name] + declaration_stream.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n', cfg, state_id, node) if deferred_allocation: allocation_stream.write( "%s = nullptr; // Deferred Allocation" % diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index e5ad6dc9dc..bbc485c336 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -613,7 +613,7 @@ def allocate_array(self, sdfg: SDFG, cfg: 
ControlFlowRegion, dfg: StateSubgraphV if nodedesc.transient: size_desc_name = nodedesc.size_desc_name if size_desc_name is not None: - size_nodedesc = sdfg.size_arrays[size_desc_name] + size_nodedesc = sdfg.arrays[size_desc_name] result_decl.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n') self._dispatcher.defined_vars.add(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype) self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef) @@ -1586,7 +1586,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub if aname in sdfg.arrays: size_arr_name = data_desc.size_desc_name if size_arr_name is not None: - size_arr = sdfg.size_arrays[data_desc.size_desc_name] + size_arr = sdfg.arrays[data_desc.size_desc_name] host_size_args[size_arr_name] = size_arr kernel_args_typed = [('const ' if k in const_params else '') + v.as_arg(name=k) diff --git a/dace/data.py b/dace/data.py index 8a606eac77..355532208b 100644 --- a/dace/data.py +++ b/dace/data.py @@ -183,6 +183,7 @@ def _transient_setter(self, value): default=dtypes.AllocationLifetime.Scope) location = DictProperty(key_type=str, value_type=str, desc='Full storage location identifier (e.g., rank, GPU ID)') debuginfo = DebugInfoProperty(allow_none=True) + size_desc_name = Property(dtype=str, default=None, allow_none=True) def __init__(self, dtype, shape, transient, storage, location, lifetime, debuginfo): self.dtype = dtype @@ -192,6 +193,7 @@ def __init__(self, dtype, shape, transient, storage, location, lifetime, debugin self.location = location if location is not None else {} self.lifetime = lifetime self.debuginfo = debuginfo + self.size_desc_name = None self._validate() def __call__(self): @@ -1385,9 +1387,6 @@ class Array(Data): 'it is inferred by other properties and the OptionalArrayInference pass.') pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred') - size_desc_name = Property(dtype=str, default=None, allow_none=True, desc='The name of the size array (1D, length is the shape of thte current array)' - 'Of the array (usually _size)') - def __init__(self, dtype, shape, diff --git a/dace/memlet.py b/dace/memlet.py index 85bd0a348d..8d396d8e4c 100644 --- a/dace/memlet.py +++ b/dace/memlet.py @@ -68,9 +68,9 @@ def __init__(self, debuginfo: Optional[dtypes.DebugInfo] = None, wcr_nonatomic: bool = False, allow_oob: bool = False): - """ + """ Constructs a Memlet. - + :param expr: A string expression of the this memlet, given as an ease of use API. Must follow one of the following forms: 1. ``ARRAY``, @@ -82,7 +82,7 @@ def __init__(self, :param subset: The subset to take from the data attached to the edge, represented either as a string or a Subset object. :param other_subset: The subset to offset into the other side of the - memlet, represented either as a string or a Subset + memlet, represented either as a string or a Subset object. :param volume: The exact number of elements moved using this memlet, or the maximum number of elements if @@ -91,14 +91,14 @@ def __init__(self, is runtime-defined and unbounded. :param dynamic: If True, the number of elements moved in this memlet is defined dynamically at runtime. - :param wcr: A lambda function (represented as a string or Python AST) + :param wcr: A lambda function (represented as a string or Python AST) specifying how write-conflicts are resolved. 
The syntax - of the lambda function receives two elements: ``current`` - value and `new` value, and returns the value after + of the lambda function receives two elements: ``current`` + value and `new` value, and returns the value after resolution. For example, summation is represented by ``'lambda cur, new: cur + new'``. :param debuginfo: Line information from the generating source code. - :param wcr_nonatomic: If True, overrides the automatic code generator + :param wcr_nonatomic: If True, overrides the automatic code generator decision and treat all write-conflict resolution operations as non-atomic, which might cause race conditions in the general case. @@ -225,16 +225,16 @@ def __deepcopy__(self, memo): return node def is_empty(self) -> bool: - """ + """ Returns True if this memlet carries no data. Memlets without data are - primarily used for connecting nodes to scopes without transferring - data to them. + primarily used for connecting nodes to scopes without transferring + data to them. """ return (self.data is None and self.subset is None and self.other_subset is None) @property def num_accesses(self): - """ + """ Returns the total memory movement volume (in elements) of this memlet. """ return self.volume @@ -255,7 +255,7 @@ def simple(data, """ DEPRECATED: Constructs a Memlet from string-based expressions. - :param data: The data object or name to access. + :param data: The data object or name to access. :param subset_str: The subset of `data` that is going to be accessed in string format. Example: '0:N'. :param wcr_str: A lambda function (as a string) specifying @@ -335,7 +335,7 @@ def _parse_from_subexpr(self, expr: str): # [subset] syntax if expr.startswith('['): return None, SubsetProperty.from_string(expr[1:-1]) - + # array[subset] syntax arrname, subset_str = expr[:-1].split('[') if not dtypes.validate_name(arrname): @@ -385,8 +385,8 @@ def _parse_memlet_from_str(self, expr: str): def try_initialize(self, sdfg: 'dace.sdfg.SDFG', state: 'dace.sdfg.SDFGState', edge: 'dace.sdfg.graph.MultiConnectorEdge'): - """ - Tries to initialize the internal fields of the memlet (e.g., src/dst + """ + Tries to initialize the internal fields of the memlet (e.g., src/dst subset) once it is added to an SDFG as an edge. """ from dace.sdfg.nodes import AccessNode, CodeNode # Avoid import loops @@ -435,7 +435,7 @@ def get_dst_subset(self, edge: 'dace.sdfg.graph.MultiConnectorEdge', state: 'dac @staticmethod def from_array(dataname, datadesc, wcr=None): - """ + """ Constructs a Memlet that transfers an entire array's contents. :param dataname: The name of the data descriptor in the SDFG. @@ -456,7 +456,7 @@ def __eq__(self, other): def replace(self, repl_dict): """ Substitute a given set of symbols with a different set of symbols. - + :param repl_dict: A dict of string symbol names to symbols with which to replace them. """ @@ -538,8 +538,8 @@ def validate(self, sdfg, state): def used_symbols(self, all_symbols: bool, edge=None) -> Set[str]: """ - Returns a set of symbols used in this edge's properties. - + Returns a set of symbols used in this edge's properties. + :param all_symbols: If False, only returns the set of symbols that will be used in the generated code and are needed as arguments. 
:param edge: If given, provides richer context-based tests for the case @@ -606,7 +606,7 @@ def get_free_symbols_by_indices(self, indices_src: List[int], indices_dst: List[ def get_stride(self, sdfg: 'dace.sdfg.SDFG', map: 'dace.sdfg.nodes.Map', dim: int = -1) -> 'dace.symbolic.SymExpr': """ Returns the stride of the underlying memory when traversing a Map. - + :param sdfg: The SDFG in which the memlet resides. :param map: The map in which the memlet resides. :param dim: The dimension that is incremented. By default it is the innermost. diff --git a/dace/sdfg/infer_types.py b/dace/sdfg/infer_types.py index c05708670e..6219784cfe 100644 --- a/dace/sdfg/infer_types.py +++ b/dace/sdfg/infer_types.py @@ -52,9 +52,9 @@ def infer_out_connector_type(sdfg: SDFG, state: SDFGState, node: nodes.CodeNode, def infer_connector_types(sdfg: SDFG): - """ + """ Infers connector types throughout an SDFG and its nested SDFGs in-place. - + :param sdfg: The SDFG to infer. """ # Loop over states, and in a topological sort over each state's nodes @@ -125,13 +125,13 @@ def set_default_schedule_and_storage_types(scope: Union[SDFG, SDFGState, nodes.E use_parent_schedule: bool = False, state: SDFGState = None, child_nodes: Dict[nodes.Node, List[nodes.Node]] = None): - """ + """ Sets default storage and schedule types throughout SDFG in-place. Replaces ``ScheduleType.Default`` and ``StorageType.Default`` - with the corresponding types according to the parent scope's schedule. - + with the corresponding types according to the parent scope's schedule. + The defaults for storage types are determined by the - ``dtypes.SCOPEDEFAULT_STORAGE`` dictionary (for example, a GPU device + ``dtypes.SCOPEDEFAULT_STORAGE`` dictionary (for example, a GPU device schedule, by default, will allocate containers on the shared memory). Following storage type inference for a scope, nested scopes (e.g., map entry, nested SDFG) are evaluated using the ``dtypes.STORAGEDEFAULT_SCHEDULE`` dictionary (for example, a diff --git a/dace/sdfg/replace.py b/dace/sdfg/replace.py index e3bea0b807..e34c6228e3 100644 --- a/dace/sdfg/replace.py +++ b/dace/sdfg/replace.py @@ -54,7 +54,7 @@ def _replsym(symlist, symrepl): def replace_dict(subgraph: 'StateSubgraphView', repl: Dict[str, str], symrepl: Optional[Dict[symbolic.SymbolicType, symbolic.SymbolicType]] = None): - """ + """ Finds and replaces all occurrences of a set of symbols/arrays in the given subgraph. :param subgraph: The given graph or subgraph to replace in. @@ -86,7 +86,7 @@ def replace_dict(subgraph: 'StateSubgraphView', def replace(subgraph: 'StateSubgraphView', name: str, new_name: str): """ Finds and replaces all occurrences of a symbol or array in the given subgraph. - + :param subgraph: The given graph or subgraph to replace in. :param name: Name to find. :param new_name: Name to replace. 
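The sdfg.py changes below rely on a naming contract between a data descriptor and its size descriptor: a transient array `A` owns a companion 1-D uint64 array `A_size`, a rename of `A` must move both dictionary keys in lockstep, and return values never receive a size descriptor. A minimal sketch of that contract (the helper name `size_desc_of` is illustrative only, not part of the patch):

    def size_desc_of(array_name: str):
        # Return values are collected into the return tuple and get no size descriptor.
        if "__return" in array_name:
            return None
        # By convention, the size descriptor of `A` is named `A_size`.
        return array_name + "_size"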
diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 8d8ea82484..eaa0717c86 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -101,8 +101,8 @@ def _nested_arrays_from_json(obj, context=None): return NestedDict({k: dace.serialize.from_json(v, context) for k, v in obj.items()}) -def _replace_dict_keys(d, old, new): - if old in d: +def _replace_dict_keys(d, old, new, filter=None): + if old in d and (filter is None or old in filter): if new in d: warnings.warn('"%s" already exists in SDFG' % new) d[new] = d[old] @@ -418,7 +418,7 @@ class SDFG(ControlFlowRegion): desc="Data descriptors for this SDFG", to_json=_arrays_to_json, from_json=_nested_arrays_from_json) - _size_arrays = Property(dtype=NestedDict, + _arrays = Property(dtype=NestedDict, desc="Data size descriptors for this SDFG", to_json=_arrays_to_json, from_json=_nested_arrays_from_json) @@ -500,7 +500,7 @@ def __init__(self, self._parent_sdfg = None self._parent_nsdfg_node = None self._arrays = NestedDict() # type: Dict[str, dt.Array] - self._size_arrays = NestedDict() + self._arrays = NestedDict() self.arg_names = [] self._labels: Set[str] = set() self.global_code = {'frame': CodeBlock("", dtypes.Language.CPP)} @@ -689,8 +689,8 @@ def arrays(self): return self._arrays @property - def size_arrays(self): - return self._size_arrays + def arrays(self): + return self._arrays @property def process_grids(self): @@ -749,18 +749,31 @@ def replace_dict(self, } # Replace in arrays and symbols (if a variable name) + size_arrays = {v.size_desc_name for v in self.arrays.values() + if v.size_desc_name is not None and v.size_desc_name in self.arrays} + non_size_arrays = {k for k in self.arrays if k not in size_arrays} + size_desc_map = dict() + if replace_keys: # Filter out nested data names, as we cannot and do not want to replace names in nested data descriptors repldict_filtered = {k: v for k, v in repldict.items() if '.' not in k} for name, new_name in repldict_filtered.items(): if validate_name(new_name): - _replace_dict_keys(self._arrays, name, new_name) - _replace_dict_keys(self._size_arrays, name + "_size", new_name + "_size") + _replace_dict_keys(self.arrays, name, new_name, non_size_arrays) + if new_name != "__return": + size_desc_map[new_name] = new_name + "_size" + _replace_dict_keys(self.arrays, name + "_size", new_name + "_size", size_arrays) _replace_dict_keys(self.symbols, name, new_name) _replace_dict_keys(self.constants_prop, name, new_name) _replace_dict_keys(self.callback_mapping, name, new_name) _replace_dict_values(self.callback_mapping, name, new_name) + # Update size descriptors + # Return_size break things delete it from the arrays + for arr_name, size_desc_name in size_desc_map.items(): + arr = self.arrays[arr_name] + arr.size_desc_name = size_desc_name if size_desc_name != "__return_size" else None + # Replace inside data descriptors for array in self.arrays.values(): replace_properties_dict(array, repldict, symrepl) @@ -1162,9 +1175,11 @@ def remove_data(self, name, validate=True): f"{node} in state {state}.") size_desc_name = self._arrays[name].size_desc_name + # If unused it might have been removed by optimization + if size_desc_name is not None and size_desc_name in self._arrays: + del self._arrays[size_desc_name] del self._arrays[name] - if size_desc_name is not None: - del self._size_arrays[size_desc_name] + def reset_sdfg_list(self): """ @@ -1689,14 +1704,14 @@ def _find_new_name(self, name: str): """ Tries to find a new name by adding an underscore and a number. 
""" names = (self._arrays.keys() | self.constants_prop.keys() | self._pgrids.keys() | self._subarrays.keys() - | self._rdistrarrays.keys() | self.symbols.keys() | self._size_arrays.keys()) + | self._rdistrarrays.keys() | self.symbols.keys() | self._arrays.keys()) return dt.find_new_name(name, names) def is_name_used(self, name: str) -> bool: """ Checks if `name` is already used inside the SDFG.""" if name in self._arrays: return True - if name in self._size_arrays: + if name in self._arrays: return True if name in self.symbols: return True @@ -1768,22 +1783,6 @@ def add_array(self, if isinstance(dtype, type) and dtype in dtypes._CONSTANT_TYPES[:-1]: dtype = dtypes.typeclass(dtype) - if transient: - size_desc = dt.Array(dtype=dace.uint64, - shape=(len(shape),), - storage=dtypes.StorageType.Default, - location=None, - allow_conflicts=False, - transient=True, - strides=(1,), - offset=(0,), - lifetime=lifetime, - alignment=alignment, - debuginfo=debuginfo, - total_size=len(shape), - may_alias=False, - size_desc_name=None) - desc = dt.Array(dtype=dtype, shape=shape, storage=storage, @@ -1800,12 +1799,6 @@ def add_array(self, size_desc_name=None) array_name = self.add_datadesc(name, desc, find_new_name=find_new_name) - if transient: - size_desc_name = f"{array_name}_size" - self.add_size_datadesc(size_desc_name, size_desc) - # In case find_new_name and a new name is returned - # we need to update the size descriptor name of the array - desc.size_desc_name = size_desc_name return array_name, desc def add_view(self, @@ -2053,15 +2046,14 @@ def add_temp_transient_like(self, desc: Union[dt.Array, dt.Scalar], dtype=None, newdesc.debuginfo = debuginfo return self.add_datadesc(self.temp_data_name(), newdesc), newdesc - @staticmethod - def _add_symbols(sdfg, desc: dt.Data): + def _add_symbols(self, desc: dt.Data): if isinstance(desc, dt.Structure): for v in desc.members.values(): if isinstance(v, dt.Data): - SDFG._add_symbols(sdfg, v) + self._add_symbols(v) for sym in desc.free_symbols: - if sym.name not in sdfg.symbols: - sdfg.add_symbol(sym.name, sym.dtype) + if sym.name not in self.symbols: + self.add_symbol(sym.name, sym.dtype) def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str: """ Adds an existing data descriptor to the SDFG array store. @@ -2092,7 +2084,7 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str else: # We do not check for data constant, because there is a link between the constants and # the data descriptors. - if name in self.arrays or name in self.size_arrays: + if name in self.arrays or name in self.arrays: raise FileExistsError(f'Data descriptor "{name}" already exists in SDFG') if name in self.symbols: raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a symbol.') @@ -2105,36 +2097,34 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str # Add the data descriptor to the SDFG and all symbols that are not yet known. self._arrays[name] = datadesc - SDFG._add_symbols(self, datadesc) - - return name - - def add_size_datadesc(self, name: str, datadesc: dt.Data) -> str: - """ Adds an existing data descriptor to the SDFG array store. - - :param name: Name to use. - :param datadesc: Data descriptor to add. - :param find_new_name: If True and data descriptor with this name - exists, finds a new name to add. - :return: Name of the new data descriptor - """ - if not isinstance(name, str): - raise TypeError("Data descriptor name must be a string. 
Got %s" % type(name).__name__) - - if name in self.arrays or name in self.size_arrays: - raise FileExistsError(f'Data descriptor "{name}" already exists in SDFG') - if name in self.symbols: - raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a symbol.') - if name in self._subarrays: - raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a subarray.') - if name in self._rdistrarrays: - raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a RedistrArray.') - if name in self._pgrids: - raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a ProcessGrid.') - - # Add the data descriptor to the SDFG and all symbols that are not yet known. - self._size_arrays[name] = datadesc - SDFG._add_symbols(self, datadesc) + self._add_symbols(datadesc) + + if ( + datadesc.transient is True and + isinstance(datadesc, dt.Array) and + name != "__return" + ): + size_desc_name = f"{name}_size" + size_desc = dt.Array(dtype=dace.uint64, + shape=(len(datadesc.shape),), + storage=dtypes.StorageType.Default, + location=None, + allow_conflicts=False, + transient=True, + strides=(1,), + offset=(0,), + lifetime=datadesc.lifetime, + alignment=datadesc.alignment, + debuginfo=datadesc.debuginfo, + total_size=len(datadesc.shape), + may_alias=False, + size_desc_name=None) + self._arrays[size_desc_name] = size_desc + # In case find_new_name and a new name is returned + # we need to update the size descriptor name of the array + datadesc.size_desc_name = size_desc_name + self._add_symbols(size_desc) + print(self._arrays) return name diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 5212147c03..73d4913630 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -325,7 +325,7 @@ def _accessible(sdfg: 'dace.sdfg.SDFG', container: str, context: Dict[str, bool] """ Helper function that returns False if a data container cannot be accessed in the current SDFG context. 
""" - storage = sdfg.arrays[container].storage + storage = sdfg.arrays[container].storage if container in sdfg.arrays else sdfg.arrays[container].storage if storage == dtypes.StorageType.GPU_Global or storage in dtypes.GPU_STORAGES: return context.get('in_gpu', False) if storage == dtypes.StorageType.FPGA_Global or storage in dtypes.FPGA_STORAGES: @@ -901,10 +901,11 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Check dimensionality of memory access if isinstance(e.data.subset, (sbs.Range, sbs.Indices)): - if e.data.subset.dims() != len(sdfg.arrays[e.data.data].shape): + desc = sdfg.arrays[e.data.data] if e.data.data in sdfg.arrays else sdfg.arrays[e.data.data] + if e.data.subset.dims() != len(desc.shape): raise InvalidSDFGEdgeError( "Memlet subset uses the wrong dimensions" - " (%dD for a %dD data node)" % (e.data.subset.dims(), len(sdfg.arrays[e.data.data].shape)), + " (%dD for a %dD data node)" % (e.data.subset.dims(), len(desc.shape)), sdfg, state_id, eid, @@ -913,8 +914,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Verify that source and destination subsets contain the same # number of elements if not e.data.allow_oob and e.data.other_subset is not None and not ( - (isinstance(src_node, nd.AccessNode) and isinstance(sdfg.arrays[src_node.data], dt.Stream)) or - (isinstance(dst_node, nd.AccessNode) and isinstance(sdfg.arrays[dst_node.data], dt.Stream))): + (isinstance(src_node, nd.AccessNode) and src_node.data in sdfg.arrays and isinstance(sdfg.arrays[src_node.data], dt.Stream)) or + (isinstance(dst_node, nd.AccessNode) and src_node.data in sdfg.arrays and isinstance(sdfg.arrays[dst_node.data], dt.Stream))): src_expr = (e.data.src_subset.num_elements() * sdfg.arrays[src_node.data].veclen) dst_expr = (e.data.dst_subset.num_elements() * sdfg.arrays[dst_node.data].veclen) if symbolic.inequal_symbols(src_expr, dst_expr): diff --git a/dace/transformation/dataflow/redundant_array.py b/dace/transformation/dataflow/redundant_array.py index 5e5072ff32..ebccf93047 100644 --- a/dace/transformation/dataflow/redundant_array.py +++ b/dace/transformation/dataflow/redundant_array.py @@ -1675,7 +1675,7 @@ def _offset_subset(self, mapping: Dict[int, int], subset: subsets.Range, edge_su class RemoveIntermediateWrite(pm.SingleStateTransformation): """ Moves intermediate writes insde a Map's subgraph outside the Map. - + Currently, the transformation supports only the case `WriteAccess -> MapExit`, where the edge has an empty Memlet. """ diff --git a/dace/transformation/passes/array_elimination.py b/dace/transformation/passes/array_elimination.py index 46411478d5..f579180ff2 100644 --- a/dace/transformation/passes/array_elimination.py +++ b/dace/transformation/passes/array_elimination.py @@ -34,7 +34,7 @@ def depends_on(self): def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[Set[str]]: """ Removes redundant arrays and access nodes. - + :param sdfg: The SDFG to modify. :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass results as ``{Pass subclass name: returned object from pass}``. 
If not run in a
                                 pipeline, an empty dictionary is expected.
@@ -84,7 +84,12 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[S
             result.update({n.data for n in removed_nodes})
 
         # If node is completely removed from graph, erase data descriptor
-        for aname, desc in list(sdfg.arrays.items()):
+        array_items = list(sdfg.arrays.items())
+        size_descriptors = {v.size_desc_name for v in sdfg.arrays.values() if v.size_desc_name is not None}
+        for aname, desc in array_items:
+            # Remove size descriptors only if the original array is removed
+            if aname in size_descriptors:
+                continue
             if not desc.transient or isinstance(desc, data.Scalar):
                 continue
             if aname not in access_sets or not access_sets[aname]:
@@ -92,7 +97,10 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[S
             if isinstance(desc, data.Structure) and len(desc.members) > 0:
                 continue
             sdfg.remove_data(aname, validate=False)
             result.add(aname)
+            # remove_data already erases the size descriptor; record it as removed too
+            if desc.size_desc_name is not None:
+                result.add(desc.size_desc_name)
 
         return result or None
 
diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py
index 6459ee6105..35b6c6c16b 100644
--- a/tests/deferred_alloc_test.py
+++ b/tests/deferred_alloc_test.py
@@ -1,4 +1,6 @@
 import dace
+from dace.transformation.dataflow.redundant_array import RedundantArray, RedundantSecondArray
+from dace.transformation.interstate.state_fusion import StateFusion
 import numpy
 import cupy
 import pytest
@@ -28,7 +30,8 @@ def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bo
 
     an_1 = state.add_access('A')
     an_1.add_in_connector('_write_size')
-    an_2 = state.add_array(name="user_size", shape=(2,), dtype=dace.uint64)
+    sdfg.add_array(name="user_size", shape=(2,), dtype=dace.uint64)
+    an_2 = state.add_access("user_size")
 
     state.add_edge(an_2, None, an_1, '_write_size',
                    dace.Memlet(expr=f"user_size[{write_size}]") )
@@ -48,7 +51,8 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool,
     an_1.add_in_connector('_write_size')
     an_1.add_out_connector('_read_size')
 
-    an_2 = state.add_array(name="user_size", shape=(2,), dtype=dace.uint64)
+    sdfg.add_array(name="user_size", shape=(2,), dtype=dace.uint64)
+    an_2 = state.add_access("user_size")
 
     state.add_edge(an_2, None, an_1, '_write_size',
                    dace.Memlet(expr="user_size[0:2]") )
@@ -116,6 +120,11 @@ def test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool)
 
     sdfg.compile()
 
+    sdfg.simplify()
+    sdfg.apply_transformations_repeated([StateFusion, RedundantArray, RedundantSecondArray])
+    sdfg.validate()
+    sdfg.compile()
+
 def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType):
     sdfg = _get_assign_map_sdfg(storage_type, transient, schedule_type)
     try:
@@ -133,14 +142,28 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sch
     if storage_type == dace.dtypes.StorageType.CPU_Heap:
         arr = numpy.array([-1.0]).astype(numpy.float32)
         user_size = numpy.array([10, 10]).astype(numpy.uint64)
-        compiled_sdfg (user_size=user_size, example_array=arr)
+        compiled_sdfg(user_size=user_size, example_array=arr)
         assert ( arr[0] == 3.0 )
     if storage_type == dace.dtypes.StorageType.GPU_Global:
         arr = cupy.array([-1.0]).astype(cupy.float32)
         user_size = numpy.array([10, 10]).astype(numpy.uint64)
-        compiled_sdfg (user_size=user_size, example_array=arr)
+        compiled_sdfg(user_size=user_size, example_array=arr)
         assert ( arr.get()[0] == 3.0 )
 
+    sdfg.simplify()
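+    # After simplification and redundant-array elimination, the reallocation must
+    # still validate, compile, and produce the same results as above.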
+    sdfg.apply_transformations_repeated([StateFusion, RedundantArray, RedundantSecondArray])
+    sdfg.validate()
+    compiled_sdfg = sdfg.compile()
+    if storage_type == dace.dtypes.StorageType.CPU_Heap:
+        arr = numpy.array([-1.0]).astype(numpy.float32)
+        user_size = numpy.array([10, 10]).astype(numpy.uint64)
+        compiled_sdfg(user_size=user_size, example_array=arr)
+        assert ( arr[0] == 3.0 )
+    if storage_type == dace.dtypes.StorageType.GPU_Global:
+        arr = cupy.array([-1.0]).astype(cupy.float32)
+        user_size = numpy.array([10, 10]).astype(numpy.uint64)
+        compiled_sdfg(user_size=user_size, example_array=arr)
+        assert ( arr.get()[0] == 3.0 )
 
 def test_realloc_inside_map():
     pass
 

From fe3748e6e825c48fd4992a07ae90e631bc99b2c2 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Tue, 3 Dec 2024 16:21:16 +0100
Subject: [PATCH 20/51] Rm rogue print

---
 dace/sdfg/sdfg.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py
index eaa0717c86..d3866cd07e 100644
--- a/dace/sdfg/sdfg.py
+++ b/dace/sdfg/sdfg.py
@@ -2124,7 +2124,6 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str
             # we need to update the size descriptor name of the array
             datadesc.size_desc_name = size_desc_name
             self._add_symbols(size_desc)
-        print(self._arrays)
 
         return name
 

From 1164e8c4d1d36f03a7dcd5b905fc6b623c67e852 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Tue, 3 Dec 2024 16:29:54 +0100
Subject: [PATCH 21/51] Rm array length checks for now

---
 tests/sdfg/cutout_test.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/sdfg/cutout_test.py b/tests/sdfg/cutout_test.py
index 151c3cab47..3ce4db5ac8 100644
--- a/tests/sdfg/cutout_test.py
+++ b/tests/sdfg/cutout_test.py
@@ -21,7 +21,6 @@ def simple_matmul(A: dace.float64[20, 20], B: dace.float64[20, 20]):
     cut_sdfg = SDFGCutout.singlestate_cutout(state, node)
     assert cut_sdfg.number_of_nodes() == 1
     assert cut_sdfg.node(0).number_of_nodes() == 4
-    assert len(cut_sdfg.arrays) == 3
     assert all(not a.transient for a in cut_sdfg.arrays.values())
 
@@ -42,7 +41,6 @@ def simple_matmul(A: dace.float64[20, 20], B: dace.float64[20, 20]):
     cut_sdfg = SDFGCutout.singlestate_cutout(state, *nodes)
     assert cut_sdfg.number_of_nodes() == 1
     assert cut_sdfg.node(0).number_of_nodes() == 7
-    assert len(cut_sdfg.arrays) == 5
     assert (not any(a.transient for a in cut_sdfg.arrays.values()))
 
@@ -309,7 +307,6 @@ def test_input_output_configuration():
     assert ct.arrays['tmp2'].transient == False
     assert ct.arrays['tmp3'].transient == True
     assert ct.arrays['tmp4'].transient == True
-    assert len(ct.arrays) == 4
 
 
 def test_minimum_cut_simple_no_further_input_config():

From c597e24fbff9304188518dd2d15c7fe79cc019ef Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Tue, 3 Dec 2024 16:42:57 +0100
Subject: [PATCH 22/51] Naming fixes

---
 dace/frontend/python/newast.py | 2 +-
 dace/sdfg/sdfg.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py
index 255bd26983..bab8ffa0ab 100644
--- a/dace/frontend/python/newast.py
+++ b/dace/frontend/python/newast.py
@@ -3970,7 +3970,7 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no
         # Change transient names
         arrays_before = list(sdfg.arrays.items())
         for arrname, array in arrays_before:
-            if array.transient and arrname[:5] == '__tmp':
+            if array.transient and arrname[:5] == '__tmp' and arrname not in sdfg.size_arrays():
                 if int(arrname[5:]) < self.sdfg._temp_transients:
                     if self.sdfg._temp_transients > sdfg._temp_transients: 
new_name = self.sdfg.temp_data_name() diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index d3866cd07e..bfa85dc6ed 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -688,6 +688,9 @@ def arrays(self): """ return self._arrays + def size_arrays(self): + return [v.size_desc_name for v in self._arrays.values() if v.size_desc_name is not None and v.size_desc_name in self._arrays] + @property def arrays(self): return self._arrays From 21dd0c3125c173f85593258737ee8108d7af85a4 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 3 Dec 2024 16:54:28 +0100 Subject: [PATCH 23/51] Name update fixes --- dace/sdfg/sdfg.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index bfa85dc6ed..488bbcae72 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -773,9 +773,11 @@ def replace_dict(self, # Update size descriptors # Return_size break things delete it from the arrays + # Changes symbol names might not related to arrays for arr_name, size_desc_name in size_desc_map.items(): - arr = self.arrays[arr_name] - arr.size_desc_name = size_desc_name if size_desc_name != "__return_size" else None + arr = self.arrays[arr_name] if arr_name in self.arrays else None + if arr is not None: + arr.size_desc_name = size_desc_name if size_desc_name != "__return_size" else None # Replace inside data descriptors for array in self.arrays.values(): From e669f7cb2e66eef3c3134dc1429d5410bb96a35c Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 6 Dec 2024 11:30:42 +0100 Subject: [PATCH 24/51] Various bugfixes on the feature --- dace/codegen/targets/cpu.py | 11 ------- dace/codegen/targets/cuda.py | 48 ++++++++++++++----------------- dace/codegen/targets/framecode.py | 15 ++++++++++ dace/sdfg/sdfg.py | 25 +++++++++------- tests/deferred_alloc_test.py | 11 ++++++- 5 files changed, 61 insertions(+), 49 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 6a2b89e4ae..85c51cb8b1 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -510,15 +510,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if not declared: declaration_stream.write(f'{nodedesc.dtype.ctype} *{name};\n', cfg, state_id, node) - # Initialize size array - size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape]) - if (nodedesc.transient and ( - nodedesc.storage == dtypes.StorageType.CPU_Heap or - nodedesc.storage == dtypes.StorageType.GPU_Global) - ): - size_desc_name = nodedesc.size_desc_name - size_nodedesc = sdfg.arrays[size_desc_name] - declaration_stream.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n', cfg, state_id, node) if deferred_allocation: allocation_stream.write( "%s = nullptr; // Deferred Allocation" % @@ -539,8 +530,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV ) define_var(name, DefinedType.Pointer, ctypedef) - if not declared: - define_var(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype) if node.setzero: allocation_stream.write("memset(%s, 0, sizeof(%s)*%s);" % diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index bbc485c336..ba4fafa4d2 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -299,7 +299,9 @@ def _compute_pool_release(self, top_sdfg: SDFG): # Add sink as terminator state for arr in unfreed: - self.pool_release[(sdfg, arr)] = (sink, set()) + if 
(sdfg.arrays[arr].storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared]
+                    and arr not in sdfg.size_arrays()):  # Do not put size arrays into the pool release
+                self.pool_release[(sdfg, arr)] = (sink, set())
 
     # Generate final code
     def get_generated_codeobjects(self):
@@ -578,6 +580,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
 
         # Check if array is already declared
         declared = False
+        size_declared = False
        try:
             self._dispatcher.declared_arrays.get(dataname)
             declared = True  # Array was already declared in this or upper scopes
@@ -608,17 +611,9 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
         # Different types of GPU arrays
         if nodedesc.storage == dtypes.StorageType.GPU_Global:
             if not declared:
-                result_decl.write('%s %s;\n' % (ctypedef, dataname))
-                size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in nodedesc.shape])
-                if nodedesc.transient:
-                    size_desc_name = nodedesc.size_desc_name
-                    if size_desc_name is not None:
-                        size_nodedesc = sdfg.arrays[size_desc_name]
-                        result_decl.write(f'{size_nodedesc.dtype.ctype} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n')
-                        self._dispatcher.defined_vars.add(size_desc_name, DefinedType.Pointer, size_nodedesc.dtype.ctype)
+                declaration_stream.write('%s %s;\n' % (ctypedef, dataname))
             self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)
-
             if deferred_allocation:
                 result_alloc.write(
                     "%s = nullptr; // Deferred Allocation" %
@@ -765,7 +760,7 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap
 
         if nodedesc.storage == dtypes.StorageType.GPU_Global:
             if not nodedesc.pool:  # If pooled, will be freed somewhere else
                 callsite_stream.write('DACE_GPU_CHECK(%sFree(%s));\n' % (self.backend, dataname), cfg, state_id, node)
         elif nodedesc.storage == dtypes.StorageType.CPU_Pinned:
             callsite_stream.write('DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), cfg, state_id, node)
         elif nodedesc.storage == dtypes.StorageType.GPU_Shared or \
@@ -1286,8 +1281,7 @@ def generate_state(self,
                     ptrname = cpp.ptr(name, desc, sd, self._frame)
                     if isinstance(desc, dt.Array) and desc.start_offset != 0:
                         ptrname = f'({ptrname} - {cpp.sym2cpp(desc.start_offset)})'
-
                     callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', sd)
                     self._emit_sync(callsite_stream)
                     to_remove.add((sd, name))
             for sd, name in to_remove:
@@ -1584,10 +1578,12 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
                 prototype_kernel_args[aname] = arg
 
             if aname in sdfg.arrays:
-                size_arr_name = data_desc.size_desc_name
-                if size_arr_name is not None:
-                    size_arr = sdfg.arrays[data_desc.size_desc_name]
-                    host_size_args[size_arr_name] = size_arr
+                arr = sdfg.arrays[aname]
+                if arr.transient and arr.storage == dtypes.StorageType.GPU_Global and arr.size_desc_name is not None:
+                    size_arr_name = data_desc.size_desc_name
+                    if size_arr_name is not None:
+                        size_arr = sdfg.arrays[data_desc.size_desc_name]
+                        host_size_args[size_arr_name] = size_arr
 
         kernel_args_typed = [('const ' if k in const_params else '') + v.as_arg(name=k)
                              for k, v in prototype_kernel_args.items()]
@@ -1626,18 +1622,16 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
 
         # Size arrays
         needed_size_scalars_declaration = []
         for size_desc_name, arg in host_size_args.items():
-            if isinstance(arg, dt.Array):
-                size_arr = arg
-                arr_name = size_desc_name.removesuffix("_size")
-                for i in range(size_arr.shape[0]):
-                    if f"__{arr_name}_dim{i}_size" not in dyn_args:
-                        dyn_args.append(f"__{arr_name}_dim{i}_size")
-                        dyn_args_typed.append(f"const {dace.uint64} __{arr_name}_dim{i}_size")
-                        needed_size_scalars_declaration.append(f"const {dace.uint64} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
+            arr_name = size_desc_name.removesuffix("_size")
+            for i in range(arg.shape[0]):
+                if f"__{arr_name}_dim{i}_size" not in dyn_args:
+                    dyn_args.append(f"__{arr_name}_dim{i}_size")
+                    dyn_args_typed.append(f"const {dace.uint64} __{arr_name}_dim{i}_size")
+                    needed_size_scalars_declaration.append(f"const {dace.uint64} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
 
         self._localcode.write(
             '__global__ void %s %s(%s) {\n' %
-            (launch_bounds, kernel_name, ', '.join(kernel_args_typed + extra_kernel_args_typed + dyn_args_typed)), sdfg, state_id, node)
+            (launch_bounds, kernel_name, ', '.join(kernel_args_typed + dyn_args_typed + extra_kernel_args_typed)), sdfg, state_id, node)
 
         # Write constant expressions in GPU code
         self._frame.generate_constants(sdfg, self._localcode)
@@ -1713,7 +1707,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
                 self._localcode.write(
                     memlet_definition, cfg, state_id, scope_entry)
-        self._localcode.write("// Array sizes of arrays are passed to the kernel even if not used in maps")
+        self._localcode.write("// Sizes of the arrays used in the kernel are passed here as scalars if needed")
 
         for decl in needed_size_scalars_declaration:
             self._localcode.write(decl, cfg, state_id, scope_entry)
diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py
index 427e8605d8..298ed1ebb4 100644
--- a/dace/codegen/targets/framecode.py
+++ b/dace/codegen/targets/framecode.py
@@ -834,6 +834,7 @@ def allocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Un
             self._dispatcher.dispatch_allocate(tsdfg, cfg if state is None else state.parent_graph, state, state_id,
                                                node, desc, function_stream, callsite_stream, declare, allocate)
 
+
     def deallocate_arrays_in_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, scope: Union[nodes.EntryNode, SDFGState, SDFG],
                                    function_stream: CodeIOStream, callsite_stream: CodeIOStream):
@@ -907,6 +908,20 @@ def generate_code(self,
         global_symbols = copy.deepcopy(sdfg.symbols)
         global_symbols.update({aname: arr.dtype for aname, arr in sdfg.arrays.items()})
 
+        # Allocate size arrays (always check as name and array changes affect size descriptor names)
+        size_arrays = {(v.size_desc_name, sdfg.arrays[v.size_desc_name])
+                       for v in sdfg.arrays.values()
+                       if v.size_desc_name is not None and v.size_desc_name in sdfg.arrays}
+        callsite_stream.write(f'//Declare size arrays\n', sdfg)
+        for size_desc_name, size_nodedesc in size_arrays:
+            assert ("__return" not in size_desc_name)
+            ctypedef = size_nodedesc.dtype.ctype
+            from dace.codegen.targets import cpp
+            size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in size_nodedesc.shape])
+            alloc_str = f'{ctypedef} {size_desc_name}[{len(size_nodedesc.shape)}]{{{size_str}}};\n'
+            callsite_stream.write(alloc_str)
+            self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef)
+
         interstate_symbols = {}
         for cfr in sdfg.all_control_flow_regions():
             for e in cfr.dfs_edges(cfr.start_block):
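The reallocation feature exercised by these commits follows one pattern on the user side: a transient with a deferred shape is resized at runtime by writing a size array into the access node's `_write_size` connector. A condensed sketch of that pattern, based on the helpers in tests/deferred_alloc_test.py (the deferred-dimension literal `__dace_defer` follows the convention used by the codegen above; exact helper arguments may differ):

    import dace

    sdfg = dace.SDFG("deferred_realloc_sketch")
    sdfg.add_array("A", ("__dace_defer", "__dace_defer"), dace.float32,
                   transient=True, storage=dace.dtypes.StorageType.CPU_Heap)
    sdfg.add_array("user_size", (2,), dace.uint64)

    state = sdfg.add_state("main")
    an = state.add_access("A")
    an.add_in_connector("_write_size")  # writing this connector triggers the realloc
    usr = state.add_access("user_size")
    state.add_edge(usr, None, an, "_write_size", dace.Memlet("user_size[0:2]"))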
a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 488bbcae72..0f64641102 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -753,7 +753,8 @@ def replace_dict(self, # Replace in arrays and symbols (if a variable name) size_arrays = {v.size_desc_name for v in self.arrays.values() - if v.size_desc_name is not None and v.size_desc_name in self.arrays} + if v.size_desc_name is not None and v.size_desc_name in self.arrays + and v.transient} non_size_arrays = {k for k in self.arrays if k not in size_arrays} size_desc_map = dict() @@ -763,7 +764,7 @@ def replace_dict(self, for name, new_name in repldict_filtered.items(): if validate_name(new_name): _replace_dict_keys(self.arrays, name, new_name, non_size_arrays) - if new_name != "__return": + if "__return" not in new_name: # To catch __return_0, __return_1, gpu__return size_desc_map[new_name] = new_name + "_size" _replace_dict_keys(self.arrays, name + "_size", new_name + "_size", size_arrays) _replace_dict_keys(self.symbols, name, new_name) @@ -772,12 +773,15 @@ def replace_dict(self, _replace_dict_values(self.callback_mapping, name, new_name) # Update size descriptors - # Return_size break things delete it from the arrays - # Changes symbol names might not related to arrays + # Return_size break things (it is collected to the tuple of return) delete it from the arrays + # If this is called because array's properties had been changed then set the size desc to none for arr_name, size_desc_name in size_desc_map.items(): arr = self.arrays[arr_name] if arr_name in self.arrays else None if arr is not None: - arr.size_desc_name = size_desc_name if size_desc_name != "__return_size" else None + if arr.transient and isinstance(arr, dt.Array): + arr.size_desc_name = size_desc_name if "__return" not in new_name else None + else: + arr.size_desc_name = None # Replace inside data descriptors for array in self.arrays.values(): @@ -1771,6 +1775,8 @@ def add_array(self, # Every Array also supports reallocation, we need to create a secondary size array # The array size is constant and not changeable, yet the values in the array can change + if name.endswith("_size"): + raise InvalidSDFGError("Array names are not allowed to end with _size") # convert strings to int if possible, unless it is not the reserved symbol for deferred allocation newshape = [] @@ -2103,16 +2109,15 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str # Add the data descriptor to the SDFG and all symbols that are not yet known. 
self._arrays[name] = datadesc self._add_symbols(datadesc) - if ( datadesc.transient is True and isinstance(datadesc, dt.Array) and - name != "__return" + "__return" not in name ): size_desc_name = f"{name}_size" size_desc = dt.Array(dtype=dace.uint64, shape=(len(datadesc.shape),), - storage=dtypes.StorageType.Default, + storage=dtypes.StorageType.CPU_Heap, location=None, allow_conflicts=False, transient=True, @@ -2468,8 +2473,8 @@ def argument_typecheck(self, args, kwargs, types_only=False): # Omit return values from arguments expected_args = collections.OrderedDict([(k, v) for k, v in expected_args.items() - if not k.startswith('__return')]) - kwargs = {k: v for k, v in kwargs.items() if not k.startswith('__return')} + if '__return' not in k]) + kwargs = {k: v for k, v in kwargs.items() if '__return' not in k} num_args_passed = len(args) + len(kwargs) num_args_expected = len(expected_args) diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py index 35b6c6c16b..1e90fad813 100644 --- a/tests/deferred_alloc_test.py +++ b/tests/deferred_alloc_test.py @@ -2,7 +2,6 @@ from dace.transformation.dataflow.redundant_array import RedundantArray, RedundantSecondArray from dace.transformation.interstate.state_fusion import StateFusion import numpy -import cupy import pytest @pytest.fixture(params=[dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global]) @@ -145,6 +144,11 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sch compiled_sdfg(user_size=user_size, example_array=arr) assert ( arr[0] == 3.0 ) if storage_type == dace.dtypes.StorageType.GPU_Global: + try: + import cupy + except Exception: + return + arr = cupy.array([-1.0]).astype(cupy.float32) user_size = numpy.array([10, 10]).astype(numpy.uint64) compiled_sdfg(user_size=user_size, example_array=arr) @@ -160,6 +164,11 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sch compiled_sdfg(user_size=user_size, example_array=arr) assert ( arr[0] == 3.0 ) if storage_type == dace.dtypes.StorageType.GPU_Global: + try: + import cupy + except Exception: + return + arr = cupy.array([-1.0]).astype(cupy.float32) user_size = numpy.array([10, 10]).astype(numpy.uint64) compiled_sdfg(user_size=user_size, example_array=arr) From 6ac34f6862e2041775e4e1c8a20260176fe5b7a0 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 6 Dec 2024 14:27:52 +0100 Subject: [PATCH 25/51] Add various fixes on distinguishing size and normal arrays --- dace/codegen/targets/framecode.py | 7 +++---- dace/data.py | 5 +++++ dace/frontend/python/newast.py | 1 + dace/sdfg/sdfg.py | 22 ++++++++++++++------ tests/transformations/redundant_copy_test.py | 6 +++--- 5 files changed, 28 insertions(+), 13 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 298ed1ebb4..c38cde8a85 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -909,11 +909,10 @@ def generate_code(self, global_symbols.update({aname: arr.dtype for aname, arr in sdfg.arrays.items()}) # Allocate size arrays (always check as name and array changes affect size descriptor names) - size_arrays = {(v.size_desc_name, sdfg.arrays[v.size_desc_name]) - for v in sdfg.arrays.values() - if v.size_desc_name is not None and v.size_desc_name in sdfg.arrays} + size_arrays = sdfg.size_arrays() callsite_stream.write(f'//Declare size arrays\n', sdfg) - for size_desc_name, size_nodedesc in size_arrays: + for size_desc_name in size_arrays: + size_nodedesc = 
sdfg.arrays[size_desc_name] assert ("__return" not in size_desc_name) ctypedef = size_nodedesc.dtype.ctype from dace.codegen.targets import cpp diff --git a/dace/data.py b/dace/data.py index 355532208b..f6c5a84417 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1387,6 +1387,8 @@ class Array(Data): 'it is inferred by other properties and the OptionalArrayInference pass.') pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred') + is_size_array = Property(dtype=bool, default=False, desc='Special array that is used to track the size of an another array') + def __init__(self, dtype, shape, @@ -1412,6 +1414,9 @@ def __init__(self, self.may_alias = may_alias self.alignment = alignment self.size_desc_name = size_desc_name + self.is_size_array = False + if size_desc_name is not None: + assert self.is_size_array is False if start_offset is not None: self.start_offset = start_offset diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index bab8ffa0ab..629f999bd8 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -3970,6 +3970,7 @@ def _parse_sdfg_call(self, funcname: str, func: Union[SDFG, SDFGConvertible], no # Change transient names arrays_before = list(sdfg.arrays.items()) for arrname, array in arrays_before: + print(arrname, array, sdfg.arrays, sdfg.size_arrays()) if array.transient and arrname[:5] == '__tmp' and arrname not in sdfg.size_arrays(): if int(arrname[5:]) < self.sdfg._temp_transients: if self.sdfg._temp_transients > sdfg._temp_transients: diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 0f64641102..ea57a7e57c 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -689,7 +689,8 @@ def arrays(self): return self._arrays def size_arrays(self): - return [v.size_desc_name for v in self._arrays.values() if v.size_desc_name is not None and v.size_desc_name in self._arrays] + size_arrays = [k for k, v in self.arrays.items() if hasattr(v, "is_size_array") and v.is_size_array is True] + return size_arrays @property def arrays(self): @@ -752,9 +753,7 @@ def replace_dict(self, } # Replace in arrays and symbols (if a variable name) - size_arrays = {v.size_desc_name for v in self.arrays.values() - if v.size_desc_name is not None and v.size_desc_name in self.arrays - and v.transient} + size_arrays = self.sdfg.size_arrays() non_size_arrays = {k for k in self.arrays if k not in size_arrays} size_desc_map = dict() @@ -775,13 +774,19 @@ def replace_dict(self, # Update size descriptors # Return_size break things (it is collected to the tuple of return) delete it from the arrays # If this is called because array's properties had been changed then set the size desc to none + size_ararys_to_rm = set() for arr_name, size_desc_name in size_desc_map.items(): arr = self.arrays[arr_name] if arr_name in self.arrays else None if arr is not None: + size_desc_name_before = arr.size_desc_name if arr.transient and isinstance(arr, dt.Array): arr.size_desc_name = size_desc_name if "__return" not in new_name else None else: arr.size_desc_name = None + if arr.size_desc_name is None and size_desc_name_before is not None: + size_ararys_to_rm.add(size_desc_name_before) + for size_arr_name in size_ararys_to_rm: + del self.arrays[size_arr_name] # Replace inside data descriptors for array in self.arrays.values(): @@ -2112,9 +2117,13 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str if ( datadesc.transient is True and isinstance(datadesc, dt.Array) and - "__return" not in 
name + "__return" not in name and + datadesc.lifetime is not dtypes.AllocationLifetime.External and + datadesc.lifetime is not dtypes.AllocationLifetime.Persistent ): size_desc_name = f"{name}_size" + # Regardless it is allocated as a register array + # It is to prevent optimizations to putting them to FPGA/GPU storage size_desc = dt.Array(dtype=dace.uint64, shape=(len(datadesc.shape),), storage=dtypes.StorageType.CPU_Heap, @@ -2123,12 +2132,13 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str transient=True, strides=(1,), offset=(0,), - lifetime=datadesc.lifetime, + lifetime=dtypes.AllocationLifetime.SDFG, alignment=datadesc.alignment, debuginfo=datadesc.debuginfo, total_size=len(datadesc.shape), may_alias=False, size_desc_name=None) + size_desc.is_size_array = True self._arrays[size_desc_name] = size_desc # In case find_new_name and a new name is returned # we need to update the size descriptor name of the array diff --git a/tests/transformations/redundant_copy_test.py b/tests/transformations/redundant_copy_test.py index 2c753c6fc5..fdf71c9693 100644 --- a/tests/transformations/redundant_copy_test.py +++ b/tests/transformations/redundant_copy_test.py @@ -74,7 +74,7 @@ def make_sdfg() -> Tuple[dace.SDFG, dace.nodes.AccessNode, dace.nodes.AccessNode ) sdfg.validate() assert state.number_of_nodes() == 4 - assert len(sdfg.arrays) == 4 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 4 return sdfg, a_an, b_an, output_an def apply_trafo( @@ -89,7 +89,7 @@ def apply_trafo( candidate = {type(trafo).in_array: in_array, type(trafo).out_array: out_array} state = sdfg.start_block state_id = sdfg.node_id(state) - initial_arrays = len(sdfg.arrays) + initial_arrays = len(sdfg.arrays) - len(sdfg.size_arrays()) initial_access_nodes = state.number_of_nodes() trafo.setup_match(sdfg, sdfg.cfg_id, state_id, candidate, 0, override=True) @@ -101,7 +101,7 @@ def apply_trafo( assert False, f"A view was created instead removing '{in_array.data}'." sdfg.validate() assert state.number_of_nodes() == initial_access_nodes - 1 - assert len(sdfg.arrays) == initial_arrays - 1 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == initial_arrays - 1 assert in_array.data not in sdfg.arrays return sdfg From 75e2739dfe3b84a46ec6531d33e6d91d41fd153b Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 6 Dec 2024 14:43:49 +0100 Subject: [PATCH 26/51] Move size array name check to validation --- dace/sdfg/sdfg.py | 15 +++++---------- dace/sdfg/validation.py | 5 +++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index ea57a7e57c..0015e56458 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -1776,13 +1776,6 @@ def add_array(self, alignment=0, may_alias=False) -> Tuple[str, dt.Array]: """ Adds an array to the SDFG data descriptor store. 
""" - - - # Every Array also supports reallocation, we need to create a secondary size array - # The array size is constant and not changeable, yet the values in the array can change - if name.endswith("_size"): - raise InvalidSDFGError("Array names are not allowed to end with _size") - # convert strings to int if possible, unless it is not the reserved symbol for deferred allocation newshape = [] for i, s in enumerate(shape): @@ -2122,8 +2115,10 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str datadesc.lifetime is not dtypes.AllocationLifetime.Persistent ): size_desc_name = f"{name}_size" - # Regardless it is allocated as a register array - # It is to prevent optimizations to putting them to FPGA/GPU storage + # Regardless of the scope and storage it is allocated as a register array + # And at the start of the SDFG (or nested SDFG), not setting SDFG prevents to_gpu assertions + # from failing. To lifetime and storage are set explicitly to + # to prevent optimizations to putting them to FPGA/GPU storage size_desc = dt.Array(dtype=dace.uint64, shape=(len(datadesc.shape),), storage=dtypes.StorageType.CPU_Heap, @@ -2132,7 +2127,7 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str transient=True, strides=(1,), offset=(0,), - lifetime=dtypes.AllocationLifetime.SDFG, + lifetime=dtypes.AllocationLifetime.State, alignment=datadesc.alignment, debuginfo=datadesc.debuginfo, total_size=len(datadesc.shape), diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 73d4913630..2fbd95ae7e 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -259,6 +259,11 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context 'rather than using multiple references to the same one', sdfg, None) references.add(id(desc)) + if name.endswith("_size") and not hasattr(desc, "is_size_array"): + raise InvalidSDFGEdgeError( + f'Only size arrays allowed to end with _size' + ) + # Because of how the code generator works Scalars can not be return values. # TODO: Remove this limitation as the CompiledSDFG contains logic for that. 
if isinstance(desc, dt.Scalar) and name.startswith("__return") and not desc.transient:

From 8c164a4071a0825364cd4240890f43c57717e9bb Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Fri, 6 Dec 2024 14:56:43 +0100
Subject: [PATCH 27/51] Fix type shadowing in GPU kernel size array unpacking

---
 dace/codegen/targets/cuda.py      | 4 ++--
 dace/codegen/targets/framecode.py | 7 +++++--
 tests/deferred_alloc_test.py      | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/dace/codegen/targets/cuda.py
index ba4fafa4d2..8efef5e9aa 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1626,8 +1626,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
             for i in range(size_arr.shape[0]):
                 if f"__{arr_name}_dim{i}_size" not in dyn_args:
                     dyn_args.append(f"__{arr_name}_dim{i}_size")
-                    dyn_args_typed.append(f"const {dace.uint64} __{arr_name}_dim{i}_size")
-                    needed_size_scalars_declaration.append(f"const {dace.uint64} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
+                    dyn_args_typed.append(f"const {arg.dtype.ctype} __{arr_name}_dim{i}_size")
+                    needed_size_scalars_declaration.append(f"const {arg.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
 
         self._localcode.write(
             '__global__ void %s %s(%s) {\n' %
diff --git a/dace/codegen/targets/framecode.py
index c38cde8a85..ac586317ff 100644
--- a/dace/codegen/targets/framecode.py
+++ b/dace/codegen/targets/framecode.py
@@ -916,8 +916,11 @@ def generate_code(self,
             assert ("__return" not in size_desc_name)
             ctypedef = size_nodedesc.dtype.ctype
             from dace.codegen.targets import cpp
-            size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in size_nodedesc.shape])
-            alloc_str = f'{ctypedef} {size_desc_name}[{len(size_nodedesc.shape)}]{{{size_str}}};\n'
+            array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name]
+            assert (len(array) == 1)
+            array = array[0]
+            size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape])
+            alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n'
             callsite_stream.write(alloc_str)
             self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef)
 
diff --git a/tests/deferred_alloc_test.py
index 1e90fad813..2b4aa17717 100644
--- a/tests/deferred_alloc_test.py
+++ b/tests/deferred_alloc_test.py
@@ -39,7 +39,7 @@ def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bo
 
 def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType.Default):
-    sdfg = dace.sdfg.SDFG(name="deferred_alloc_test_4")
+    sdfg = dace.sdfg.SDFG(name="deferred_alloc_test_2")
 
     sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type,
                    lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient)

From e76f39db7bc9d00528b30fd2c78d372cd2a8e89e Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Fri, 6 Dec 2024 15:07:51 +0100
Subject: [PATCH 28/51] Make tests consider size arrays (TODO: maybe make
 arrays() not return size arrays); allocate size arrays only after symbols
 are allocated

---
 dace/codegen/targets/framecode.py      | 33 +++++++++++++-------------
 tests/passes/array_elimination_test.py |  2 +-
 tests/passes/lift_struct_views_test.py | 20 ++++++++--------
tests/passes/scalar_fission_test.py | 6 ++--- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index ac586317ff..ad4fc524d4 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -908,22 +908,6 @@ def generate_code(self, global_symbols = copy.deepcopy(sdfg.symbols) global_symbols.update({aname: arr.dtype for aname, arr in sdfg.arrays.items()}) - # Allocate size arrays (always check as name and array changes affect size descriptor names) - size_arrays = sdfg.size_arrays() - callsite_stream.write(f'//Declare size arrays\n', sdfg) - for size_desc_name in size_arrays: - size_nodedesc = sdfg.arrays[size_desc_name] - assert ("__return" not in size_desc_name) - ctypedef = size_nodedesc.dtype.ctype - from dace.codegen.targets import cpp - array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name] - assert (len(array) == 1) - array = array[0] - size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape]) - alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' - callsite_stream.write(alloc_str) - self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) - interstate_symbols = {} for cfr in sdfg.all_control_flow_regions(): for e in cfr.dfs_edges(cfr.start_block): @@ -975,6 +959,23 @@ def generate_code(self, callsite_stream.write('\n', sdfg) + # After the symbols + # Allocate size arrays (always check as name and array changes affect size descriptor names) + size_arrays = sdfg.size_arrays() + callsite_stream.write(f'//Declare size arrays\n', sdfg) + for size_desc_name in size_arrays: + size_nodedesc = sdfg.arrays[size_desc_name] + assert ("__return" not in size_desc_name) + ctypedef = size_nodedesc.dtype.ctype + from dace.codegen.targets import cpp + array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name] + assert (len(array) == 1) + array = array[0] + size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape]) + alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' + callsite_stream.write(alloc_str) + self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) + ####################################################################### # Generate actual program body diff --git a/tests/passes/array_elimination_test.py b/tests/passes/array_elimination_test.py index f20428fda3..13d70cdead 100644 --- a/tests/passes/array_elimination_test.py +++ b/tests/passes/array_elimination_test.py @@ -30,7 +30,7 @@ def tester(A: dace.float64[20], B: dace.float64[20]): sdutil.inline_sdfgs(sdfg) sdutil.fuse_states(sdfg) Pipeline([ArrayElimination()]).apply_pass(sdfg, {}) - assert len(sdfg.arrays) == 4 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 4 def test_merge_simple(): diff --git a/tests/passes/lift_struct_views_test.py b/tests/passes/lift_struct_views_test.py index 71f19215b5..81a05d5f5c 100644 --- a/tests/passes/lift_struct_views_test.py +++ b/tests/passes/lift_struct_views_test.py @@ -25,14 +25,14 @@ def test_simple_tasklet_access(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 res = LiftStructViews().apply_pass(sdfg, {}) 
assert len(res['A']) == 1 assert len(res['Z']) == 1 assert len(state.nodes()) == 5 - assert len(sdfg.arrays) == 4 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 4 assert sdfg.is_valid() @@ -55,14 +55,14 @@ def test_sliced_tasklet_access(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 res = LiftStructViews().apply_pass(sdfg, {}) assert len(res['A']) == 1 assert len(res['Z']) == 1 assert len(state.nodes()) == 5 - assert len(sdfg.arrays) == 4 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 4 assert sdfg.is_valid() @@ -91,12 +91,12 @@ def test_sliced_multi_tasklet_access(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 FixedPointPipeline([LiftStructViews()]).apply_pass(sdfg, {}) assert len(state.nodes()) == 9 - assert len(sdfg.arrays) == 8 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 8 assert sdfg.is_valid() @@ -121,12 +121,12 @@ def test_tasklet_access_to_cont_array(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z[0]')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 FixedPointPipeline([LiftStructViews()]).apply_pass(sdfg, {}) assert len(state.nodes()) == 7 - assert len(sdfg.arrays) == 6 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 6 assert sdfg.is_valid() @@ -157,12 +157,12 @@ def test_sliced_multi_tasklet_access_to_cont_array(): state.add_edge(t1, 'o1', write, None, dace.Memlet('Z[0]')) assert len(state.nodes()) == 3 - assert len(sdfg.arrays) == 2 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 2 FixedPointPipeline([LiftStructViews()]).apply_pass(sdfg, {}) assert len(state.nodes()) == 11 - assert len(sdfg.arrays) == 10 + assert len(sdfg.arrays) - len(sdfg.size_arrays()) == 10 assert sdfg.is_valid() diff --git a/tests/passes/scalar_fission_test.py b/tests/passes/scalar_fission_test.py index adf66f5b1d..b53e9b9d3a 100644 --- a/tests/passes/scalar_fission_test.py +++ b/tests/passes/scalar_fission_test.py @@ -102,7 +102,7 @@ def test_scalar_fission(): # Both interstate edges should be different now. assert tmp1_edge.assignments != tmp2_edge.assignments # There should now be 5 arrays in the SDFG, i.e. 2 more than before since two isolated scopes of tmp exist. - assert len(sdfg.arrays.keys()) == 5 + assert len(sdfg.arrays.keys()) - len(sdfg.size_arrays()) == 5 # Assert all accesses per scope are identical. 
assert all([n.data == list(tmp1_edge.assignments.values())[0] for n in [tmp1_write, loop1_read_tmp]]) assert all([n.data == list(tmp2_edge.assignments.values())[0] for n in [tmp2_write, loop2_read_tmp]]) @@ -187,7 +187,7 @@ def test_branch_subscopes_nofission(): Pipeline([ScalarFission()]).apply_pass(sdfg, {}) - assert set(sdfg.arrays.keys()) == {'A', 'B', 'C'} + assert set(sdfg.arrays.keys()).difference(sdfg.size_arrays()) == {'A', 'B', 'C'} def test_branch_subscopes_fission(): sdfg = dace.SDFG('branch_subscope_fission') @@ -279,7 +279,7 @@ def test_branch_subscopes_fission(): Pipeline([ScalarFission()]).apply_pass(sdfg, {}) - assert set(sdfg.arrays.keys()) == {'A', 'B', 'C', 'B_0', 'B_1'} + assert set(sdfg.arrays.keys()).difference(set(sdfg.size_arrays())) == {'A', 'B', 'C', 'B_0', 'B_1'} if __name__ == '__main__': test_scalar_fission() From 15b00cc7bd03097ec328f0a92de6761e17fcbe1f Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 6 Dec 2024 15:09:39 +0100 Subject: [PATCH 29/51] Fix size array name check in validation --- dace/sdfg/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 2fbd95ae7e..a7ff2479d4 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -259,9 +259,9 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context 'rather than using multiple references to the same one', sdfg, None) references.add(id(desc)) - if name.endswith("_size") and not hasattr(desc, "is_size_array"): + if name.endswith("_size") and hasattr(desc, "is_size_array") and desc.is_size_array is False: raise InvalidSDFGEdgeError( - f'Only size arrays allowed to end with _size' + f'Only size arrays allowed to end with _size', sdfg, None ) # Because of how the code generator works Scalars can not be return values. 
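For reference between the commits: the rule that patch 29 finalizes can be stated independently of the codebase, namely that a descriptor may carry the reserved `_size` suffix only if it is flagged as a size array. Below is a minimal, self-contained sketch of that invariant; the `is_size_array` flag and the `_size` suffix convention come from the diffs above, while the `Desc` class and `check_size_name` helper are illustrative stand-ins, not DaCe API:

    # Standalone sketch of the "_size" naming invariant enforced in validation.
    class Desc:
        """Stand-in for a data descriptor; DaCe uses dace.data.Array."""
        def __init__(self, is_size_array: bool = False):
            self.is_size_array = is_size_array

    def check_size_name(name: str, desc: Desc) -> None:
        # Only descriptors explicitly flagged as size arrays may use the suffix.
        if name.endswith("_size") and not getattr(desc, "is_size_array", False):
            raise ValueError(f"Only size arrays are allowed to end with _size: {name}")

    check_size_name("A_size", Desc(is_size_array=True))   # accepted
    try:
        check_size_name("B_size", Desc())                  # rejected
    except ValueError as err:
        print(err)
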
From ee8a70857d5e9f294272bd8a17881486a8d42d88 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 6 Dec 2024 15:24:44 +0100 Subject: [PATCH 30/51] Various fixes --- dace/codegen/targets/framecode.py | 1 + dace/sdfg/sdfg.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index ad4fc524d4..b74867a5f8 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -971,6 +971,7 @@ def generate_code(self, array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name] assert (len(array) == 1) array = array[0] + print("AA", array, array.shape, sdfg.arrays) size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape]) alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' callsite_stream.write(alloc_str) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 0015e56458..febc4a2885 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -779,13 +779,13 @@ def replace_dict(self, arr = self.arrays[arr_name] if arr_name in self.arrays else None if arr is not None: size_desc_name_before = arr.size_desc_name - if arr.transient and isinstance(arr, dt.Array): + if arr.transient and type(arr) == dt.Array: arr.size_desc_name = size_desc_name if "__return" not in new_name else None else: arr.size_desc_name = None if arr.size_desc_name is None and size_desc_name_before is not None: size_ararys_to_rm.add(size_desc_name_before) - for size_arr_name in size_ararys_to_rm: + for size_arr_name in size_ararys_to_rm and size_arr_name in self.arrays: del self.arrays[size_arr_name] # Replace inside data descriptors @@ -2109,7 +2109,7 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str self._add_symbols(datadesc) if ( datadesc.transient is True and - isinstance(datadesc, dt.Array) and + type(datadesc) == dt.Array and "__return" not in name and datadesc.lifetime is not dtypes.AllocationLifetime.External and datadesc.lifetime is not dtypes.AllocationLifetime.Persistent From f195e3fb7424ba60a1a6aa7ae8c93276da9caf1b Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 6 Dec 2024 15:41:12 +0100 Subject: [PATCH 31/51] Fix validation case --- dace/sdfg/validation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index a7ff2479d4..e5226375c9 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -259,9 +259,10 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context 'rather than using multiple references to the same one', sdfg, None) references.add(id(desc)) - if name.endswith("_size") and hasattr(desc, "is_size_array") and desc.is_size_array is False: - raise InvalidSDFGEdgeError( - f'Only size arrays allowed to end with _size', sdfg, None + if (name.endswith("_size") and desc.transient and type(desc) == dt.Array and + hasattr(desc, "is_size_array") and desc.is_size_array is False): + raise InvalidSDFGError( + f'Only size arrays allowed to end with _size, desc: {desc}, storage: {desc.storage}, transient: {desc.transient}', sdfg, None ) # Because of how the code generator works Scalars can not be return values. 
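The conditions patch 31 tightens (transient, `dt.Array`, `is_size_array`) mirror how deferred-size arrays are declared in `tests/deferred_alloc_test.py` earlier in this series. A condensed usage sketch, assuming only what those tests show; the SDFG name and the extra state here are illustrative:

    import dace

    # A transient CPU_Heap array whose second dimension uses the reserved
    # "__dace_defer" symbol; adding it also creates the hidden "A_size"
    # descriptor that the generated code updates on reallocation.
    sdfg = dace.sdfg.SDFG(name="deferred_alloc_sketch")
    sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32,
                   storage=dace.dtypes.StorageType.CPU_Heap,
                   lifetime=dace.dtypes.AllocationLifetime.SDFG,
                   transient=True)
    sdfg.add_state("init")
    sdfg.validate()  # transient + CPU_Heap is a legal deferred-allocation case
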
From e915607f665f000939254d0ce7bfd9b372c87859 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 6 Dec 2024 16:19:07 +0100 Subject: [PATCH 32/51] Improve filtering for size arrays --- dace/codegen/targets/cuda.py | 2 +- dace/codegen/targets/framecode.py | 23 ++++++++++++++++------- dace/sdfg/sdfg.py | 18 ++++++++---------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 8efef5e9aa..eb205d30c8 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1582,7 +1582,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub if arr.transient and arr.storage == dtypes.StorageType.GPU_Global and arr.size_desc_name is not None: size_arr_name = data_desc.size_desc_name if size_arr_name is not None: - size_arr = sdfg.arrays[data_desc.size_desc_name] + size_arr = sdfg.arrays[size_arr_name] host_size_args[size_arr_name] = size_arr kernel_args_typed = [('const ' if k in const_params else '') + v.as_arg(name=k) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index b74867a5f8..1cb325042d 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -961,6 +961,8 @@ def generate_code(self, # After the symbols # Allocate size arrays (always check as name and array changes affect size descriptor names) + # Only allocate arrays that really require deferred allocation (symbol has __dace_defer) + # Reshaping these arrays are not allowed size_arrays = sdfg.size_arrays() callsite_stream.write(f'//Declare size arrays\n', sdfg) for size_desc_name in size_arrays: @@ -969,13 +971,20 @@ def generate_code(self, ctypedef = size_nodedesc.dtype.ctype from dace.codegen.targets import cpp array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name] - assert (len(array) == 1) - array = array[0] - print("AA", array, array.shape, sdfg.arrays) - size_str = ",".join(["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape]) - alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' - callsite_stream.write(alloc_str) - self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) + if len(array) != 1: + print(array) + assert len(array) <= 1 + if len(array) == 1: + array = array[0] + if any(["__dace_defer" in str(dim) for dim in array.shape]): + dimensions = ["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape] + if any(["__dace_defer" in cpp.sym2cpp(dim) for dim in array.shape]): + size_str = ",".join(dimensions) + assert len(size_nodedesc.shape) == 1 + print("BB", size_nodedesc.shape, dimensions, array.shape) + alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' + callsite_stream.write(alloc_str) + self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) ####################################################################### # Generate actual program body diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index febc4a2885..7eafecaf6d 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -101,8 +101,8 @@ def _nested_arrays_from_json(obj, context=None): return NestedDict({k: dace.serialize.from_json(v, context) for k, v in obj.items()}) -def _replace_dict_keys(d, old, new, filter=None): - if old in d and (filter is None or old in filter): +def _replace_dict_keys(d, old, new, 
filter_set=None): + if old in d and (filter_set is None or old in filter_set): if new in d: warnings.warn('"%s" already exists in SDFG' % new) d[new] = d[old] @@ -763,26 +763,24 @@ def replace_dict(self, for name, new_name in repldict_filtered.items(): if validate_name(new_name): _replace_dict_keys(self.arrays, name, new_name, non_size_arrays) + # Size desc names are updated later if "__return" not in new_name: # To catch __return_0, __return_1, gpu__return size_desc_map[new_name] = new_name + "_size" - _replace_dict_keys(self.arrays, name + "_size", new_name + "_size", size_arrays) _replace_dict_keys(self.symbols, name, new_name) _replace_dict_keys(self.constants_prop, name, new_name) _replace_dict_keys(self.callback_mapping, name, new_name) _replace_dict_values(self.callback_mapping, name, new_name) # Update size descriptors - # Return_size break things (it is collected to the tuple of return) delete it from the arrays + # Having return_size break things (it is collected to the tuple of return) delete it from the arrays # If this is called because array's properties had been changed then set the size desc to none size_ararys_to_rm = set() for arr_name, size_desc_name in size_desc_map.items(): arr = self.arrays[arr_name] if arr_name in self.arrays else None if arr is not None: size_desc_name_before = arr.size_desc_name - if arr.transient and type(arr) == dt.Array: + if arr.transient and type(arr) == dt.Array and size_desc_name_before is not None: arr.size_desc_name = size_desc_name if "__return" not in new_name else None - else: - arr.size_desc_name = None if arr.size_desc_name is None and size_desc_name_before is not None: size_ararys_to_rm.add(size_desc_name_before) for size_arr_name in size_ararys_to_rm and size_arr_name in self.arrays: @@ -2112,7 +2110,8 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str type(datadesc) == dt.Array and "__return" not in name and datadesc.lifetime is not dtypes.AllocationLifetime.External and - datadesc.lifetime is not dtypes.AllocationLifetime.Persistent + datadesc.lifetime is not dtypes.AllocationLifetime.Persistent and + any(["__dace_defer" in str(dim) for dim in datadesc.shape]) ): size_desc_name = f"{name}_size" # Regardless of the scope and storage it is allocated as a register array @@ -2120,7 +2119,7 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str # from failing. 
To lifetime and storage are set explicitly to # to prevent optimizations to putting them to FPGA/GPU storage size_desc = dt.Array(dtype=dace.uint64, - shape=(len(datadesc.shape),), + shape=(len(list(datadesc.shape)),), storage=dtypes.StorageType.CPU_Heap, location=None, allow_conflicts=False, @@ -2130,7 +2129,6 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str lifetime=dtypes.AllocationLifetime.State, alignment=datadesc.alignment, debuginfo=datadesc.debuginfo, - total_size=len(datadesc.shape), may_alias=False, size_desc_name=None) size_desc.is_size_array = True From 9d646dc71c8d83ef0a38300d096f45b5563b59c5 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Mon, 9 Dec 2024 11:29:05 +0100 Subject: [PATCH 33/51] Improve tests, improve deferred alloc check --- dace/codegen/targets/framecode.py | 16 +++--- dace/data.py | 4 ++ dace/sdfg/sdfg.py | 90 ++++++++++++++++++++----------- dace/sdfg/validation.py | 18 +++++++ tests/deferred_alloc_test.py | 80 ++++++++++++++------------- 5 files changed, 128 insertions(+), 80 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 1cb325042d..66b71bbb74 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -971,20 +971,16 @@ def generate_code(self, ctypedef = size_nodedesc.dtype.ctype from dace.codegen.targets import cpp array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name] - if len(array) != 1: - print(array) assert len(array) <= 1 if len(array) == 1: array = array[0] - if any(["__dace_defer" in str(dim) for dim in array.shape]): + if type(array) == dace.data.Array and array.is_deferred_array: dimensions = ["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape] - if any(["__dace_defer" in cpp.sym2cpp(dim) for dim in array.shape]): - size_str = ",".join(dimensions) - assert len(size_nodedesc.shape) == 1 - print("BB", size_nodedesc.shape, dimensions, array.shape) - alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' - callsite_stream.write(alloc_str) - self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) + size_str = ",".join(dimensions) + assert len(size_nodedesc.shape) == 1 + alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' + callsite_stream.write(alloc_str) + self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) ####################################################################### # Generate actual program body diff --git a/dace/data.py b/dace/data.py index f6c5a84417..a3b008f150 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1388,6 +1388,7 @@ class Array(Data): pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred') is_size_array = Property(dtype=bool, default=False, desc='Special array that is used to track the size of an another array') + is_deferred_array = Property(dtype=bool, default=False, desc='Array that requires deferred allocation') def __init__(self, dtype, @@ -1440,6 +1441,9 @@ def __init__(self, self.offset = cp.copy(offset) else: self.offset = [0] * len(shape) + + self.is_deferred_array = any(["__dace_defer" in str(dim) for dim in self.shape]) + self.validate() def __repr__(self): diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 7eafecaf6d..a08f572782 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -764,8 +764,10 @@ 
def replace_dict(self,
         if validate_name(new_name):
             _replace_dict_keys(self.arrays, name, new_name, non_size_arrays)
             # Size desc names are updated later
-            if "__return" not in new_name: # To catch __return_0, __return_1, gpu__return
+            if "__return" not in new_name: # To catch __return_0, __return_1, gpu__return, fpga__return
                 size_desc_map[new_name] = new_name + "_size"
+            else:
+                size_desc_map[new_name] = None
             _replace_dict_keys(self.symbols, name, new_name)
             _replace_dict_keys(self.constants_prop, name, new_name)
             _replace_dict_keys(self.callback_mapping, name, new_name)
             _replace_dict_values(self.callback_mapping, name, new_name)
@@ -779,12 +781,28 @@ def replace_dict(self,
             arr = self.arrays[arr_name] if arr_name in self.arrays else None
             if arr is not None:
                 size_desc_name_before = arr.size_desc_name
-                if arr.transient and type(arr) == dt.Array and size_desc_name_before is not None:
-                    arr.size_desc_name = size_desc_name if "__return" not in new_name else None
+                # If we change the name of an array, then we need to change its size array accordingly
+                if (arr.transient and type(arr) == dt.Array and size_desc_name_before is not None
+                    and size_desc_name is not None):
+                    arr.size_desc_name = size_desc_name
+                    assert (arr.size_desc_name == size_desc_name)
+                    self.arrays[size_desc_name] = self.arrays.pop(size_desc_name_before)
+                # If the new size array is None, then we can remove the previous (and now unused) size array
                 if arr.size_desc_name is None and size_desc_name_before is not None:
                     size_ararys_to_rm.add(size_desc_name_before)
+                # If the new size array is not None, but it was None before, we need to add the size array
+                if size_desc_name_before is None and arr.size_desc_name is not None:
+                    retval = self._get_size_arr(arr_name, arr)
+                    if retval is not None:
+                        size_desc_name, size_desc = retval
+                        assert (size_desc_name == arr.size_desc_name)
+                        self._arrays[size_desc_name] = size_desc
+                        self._add_symbols(size_desc)
+
+        # Remove any size arrays that are no longer referenced
         for size_arr_name in size_ararys_to_rm:
             if size_arr_name in self.arrays:
                 del self.arrays[size_arr_name]
 
         # Replace inside data descriptors
         for array in self.arrays.values():
@@ -2062,6 +2080,37 @@ def _add_symbols(self, desc: dt.Data):
             if sym.name not in self.symbols:
                 self.add_symbol(sym.name, sym.dtype)
 
+    def _get_size_arr(self, name: str, datadesc: dt.Data):
+        if (
+            datadesc.transient is True and
+            type(datadesc) == dt.Array and
+            "__return" not in name and
+            datadesc.lifetime is not dtypes.AllocationLifetime.External and
+            datadesc.lifetime is not dtypes.AllocationLifetime.Persistent and
+            datadesc.is_deferred_array
+        ):
+            size_desc_name = f"{name}_size"
+            # Regardless of the scope and storage it is allocated as a register array
+            # at the start of the SDFG (or nested SDFG); not tying it to the SDFG lifetime prevents to_gpu assertions
+            # from failing. Lifetime and storage are set explicitly
+            # to prevent optimizations from moving them to FPGA/GPU storage
+            size_desc = dt.Array(dtype=dace.uint64,
+                                 shape=(len(datadesc.shape),),
+                                 storage=dtypes.StorageType.CPU_Heap,
+                                 location=None,
+                                 allow_conflicts=False,
+                                 transient=True,
+                                 strides=(1,),
+                                 offset=(0,),
+                                 lifetime=dtypes.AllocationLifetime.State,
+                                 alignment=datadesc.alignment,
+                                 debuginfo=datadesc.debuginfo,
+                                 may_alias=False,
+                                 size_desc_name=None)
+            size_desc.is_size_array = True
+            return size_desc_name, size_desc
+        return None
+
     def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str:
         """ Adds an existing data descriptor to the SDFG array store.
@@ -2105,33 +2154,10 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str
         # Add the data descriptor to the SDFG and all symbols that are not yet known.
         self._arrays[name] = datadesc
         self._add_symbols(datadesc)
-        if (
-            datadesc.transient is True and
-            type(datadesc) == dt.Array and
-            "__return" not in name and
-            datadesc.lifetime is not dtypes.AllocationLifetime.External and
-            datadesc.lifetime is not dtypes.AllocationLifetime.Persistent and
-            any(["__dace_defer" in str(dim) for dim in datadesc.shape])
-        ):
-            size_desc_name = f"{name}_size"
-            # Regardless of the scope and storage it is allocated as a register array
-            # And at the start of the SDFG (or nested SDFG), not setting SDFG prevents to_gpu assertions
-            # from failing. To lifetime and storage are set explicitly to
-            # to prevent optimizations to putting them to FPGA/GPU storage
-            size_desc = dt.Array(dtype=dace.uint64,
-                                 shape=(len(list(datadesc.shape)),),
-                                 storage=dtypes.StorageType.CPU_Heap,
-                                 location=None,
-                                 allow_conflicts=False,
-                                 transient=True,
-                                 strides=(1,),
-                                 offset=(0,),
-                                 lifetime=dtypes.AllocationLifetime.State,
-                                 alignment=datadesc.alignment,
-                                 debuginfo=datadesc.debuginfo,
-                                 may_alias=False,
-                                 size_desc_name=None)
-            size_desc.is_size_array = True
+
+        retval = self._get_size_arr(name, datadesc)
+        if retval is not None:
+            size_desc_name, size_desc = retval
             self._arrays[size_desc_name] = size_desc
             # In case find_new_name and a new name is returned
             # we need to update the size descriptor name of the array
diff --git a/dace/sdfg/validation.py
index e5226375c9..c4173dd181 100644
--- a/dace/sdfg/validation.py
+++ b/dace/sdfg/validation.py
@@ -306,6 +306,24 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context
                     "Arrays that use a multibank access pattern must have the size of the first dimension equal"
                     f" the number of banks and have at least 2 dimensions for array {name}", sdfg, None)
 
+            # Check the size array shapes match
+            if type(desc) == dt.Array:
+                if desc.is_size_array is False and desc.size_desc_name is not None:
+                    # A regular (non-size) array that has an associated size array
+                    size_desc = sdfg._arrays[desc.size_desc_name]
+                    size_arr_len = size_desc.shape[0]
+                    if not isinstance(size_arr_len, int) and (isinstance(size_arr_len, dace.symbolic.symbol) and not size_arr_len.is_integer):
+                        raise InvalidSDFGError(
+                            f"Size arrays need to be one-dimensional and have an integer length known at compile time. {desc.size_desc_name}: {size_desc.shape}"
+                            , sdfg, None
+                        )
+                    # TODO: This check can be implemented as part of a getter/setter on the dimensions of the array?
+ if int(size_arr_len) != len(desc.shape): + raise InvalidSDFGError( + f"Size arrays size needs to match to shape of its array: {desc.size_desc_name}, {size_desc.shape}: {name}, {desc.shape}" + , sdfg, None + ) + # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py index 2b4aa17717..adc5427a9a 100644 --- a/tests/deferred_alloc_test.py +++ b/tests/deferred_alloc_test.py @@ -20,7 +20,7 @@ def schedule_type(storage_type): return dace.dtypes.ScheduleType.GPU_Device def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, write_size="0:2"): - sdfg = dace.sdfg.SDFG(name="deferred_alloc_test") + sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_1") sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, transient=transient) @@ -37,9 +37,8 @@ def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bo return sdfg - def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType.Default): - sdfg = dace.sdfg.SDFG(name="deferred_alloc_test_2") + sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_2") sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient) @@ -100,21 +99,20 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, return sdfg - -def _valid_to_reallocate(transient, storage_type, scope): +def _valid_to_reallocate(transient, storage_type): return transient and (storage_type == dace.dtypes.StorageType.GPU_Global or storage_type == dace.dtypes.StorageType.CPU_Heap) -def test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool): +def _test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool): sdfg = _get_trivial_alloc_sdfg(storage_type, transient) try: sdfg.validate() except Exception: - if not _valid_to_reallocate(transient, storage_type, None): + if not _valid_to_reallocate(transient, storage_type): return else: raise AssertionError("Realloc with transient data failed when it was expected not to.") - if not _valid_to_reallocate(transient, storage_type, None): + if not _valid_to_reallocate(transient, storage_type): raise AssertionError("Realloc with non-transient data did not fail when it was expected to.") sdfg.compile() @@ -124,17 +122,18 @@ def test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool) sdfg.validate() sdfg.compile() -def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType): + +def _test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType): sdfg = _get_assign_map_sdfg(storage_type, transient, schedule_type) try: sdfg.validate() except Exception: - if not _valid_to_reallocate(transient, storage_type, None): + if not _valid_to_reallocate(transient, storage_type): return else: raise AssertionError("Realloc-use with transient data failed when it was expected not to.") - if not _valid_to_reallocate(transient, storage_type, None): + if not _valid_to_reallocate(transient, storage_type): raise AssertionError("Realloc-use with non-transient data did not fail when it was expected to.") compiled_sdfg = sdfg.compile() @@ -174,13 +173,23 @@ def 
test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sch compiled_sdfg(user_size=user_size, example_array=arr) assert ( arr.get()[0] == 3.0 ) -def test_realloc_inside_map(): - pass +@pytest.mark.gpu +def test_realloc_use_gpu(transient: bool): + _test_realloc_use(dace.dtypes.StorageType.GPU_Global, transient, dace.dtypes.ScheduleType.GPU_Device) +def test_realloc_use_cpu(transient: bool): + _test_realloc_use(dace.dtypes.StorageType.CPU_Heap, transient, dace.dtypes.ScheduleType.Sequential) -def test_all_combinations(storage_type, transient, schedule_type): - test_trivial_realloc(storage_type, transient) - test_realloc_use(storage_type, transient, schedule_type) +@pytest.mark.gpu +def test_trivial_realloc_gpu(transient: bool): + _test_trivial_realloc(dace.dtypes.StorageType.GPU_Global, transient) + +def test_trivial_realloc_cpu(transient: bool): + _test_trivial_realloc(dace.dtypes.StorageType.CPU_Heap, transient) + + +def test_realloc_inside_map(): + pass def test_incomplete_write_dimensions_1(): sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2") @@ -202,28 +211,23 @@ def test_incomplete_write_dimensions_2(): if __name__ == "__main__": - for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), - (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: - print(f"Trivial Realloc with storage {storage_type}") - test_trivial_realloc(storage_type, True) - print(f"Trivial Realloc-Use with storage {storage_type}") - test_realloc_use(storage_type, True, schedule_type) - - for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), - (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: - print(f"Trivial Realloc with storage {storage_type} on non-transient data") - test_trivial_realloc(storage_type, False) - print(f"Trivial Realloc-Use with storage {storage_type} on non-transient data") - test_realloc_use(storage_type, False, schedule_type) - - # Try some other combinations - for transient in [True, False]: - for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), - (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: - print(f"Trivial Realloc with storage {storage_type} on transient:{transient} data") - test_trivial_realloc(storage_type, transient) - print(f"Trivial Realloc-Use with storage {storage_type} on transient:{transient} data") - test_realloc_use(storage_type, transient, schedule_type) + print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap}") + test_trivial_realloc_cpu(dace.dtypes.StorageType.CPU_Heap, True) + print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap}") + test_realloc_use_cpu(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.Sequential) + print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global}") + test_trivial_realloc_gpu(dace.dtypes.StorageType.GPU_Global, True) + print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global}") + test_realloc_use_gpu(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device) + + print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap} on non-transient data") + test_trivial_realloc_cpu(dace.dtypes.StorageType.CPU_Heap, False) + print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap} on non-transient data") + 
test_realloc_use_cpu(dace.dtypes.StorageType.CPU_Heap, False, dace.dtypes.ScheduleType.Sequential) + print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global} on non-transient data") + test_trivial_realloc_gpu(dace.dtypes.StorageType.GPU_Global, False) + print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global} on non-transient data") + test_realloc_use_gpu(dace.dtypes.StorageType.GPU_Global, False, dace.dtypes.ScheduleType.GPU_Device) print(f"Realloc with incomplete write 1") test_incomplete_write_dimensions_1() From 3854c82799ebc4b9a5aa7d47059edbb24f74e2e2 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Mon, 9 Dec 2024 12:11:46 +0100 Subject: [PATCH 34/51] Fix type check imports --- dace/codegen/targets/framecode.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 66b71bbb74..416bbe6ed5 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -23,7 +23,7 @@ from dace.sdfg.analysis import cfg as cfg_analysis from dace.sdfg.state import ControlFlowRegion, LoopRegion from dace.transformation.passes.analysis import StateReachability, loop_analysis - +from dace.codegen.targets import cpp def _get_or_eval_sdfg_first_arg(func, sdfg): if callable(func): @@ -969,12 +969,11 @@ def generate_code(self, size_nodedesc = sdfg.arrays[size_desc_name] assert ("__return" not in size_desc_name) ctypedef = size_nodedesc.dtype.ctype - from dace.codegen.targets import cpp array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name] assert len(array) <= 1 if len(array) == 1: array = array[0] - if type(array) == dace.data.Array and array.is_deferred_array: + if type(array) == data.Array and array.is_deferred_array: dimensions = ["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape] size_str = ",".join(dimensions) assert len(size_nodedesc.shape) == 1 From 2408ad0816b2e2612902f45d399f7fa5b2a0b4d8 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 10 Dec 2024 17:24:58 +0100 Subject: [PATCH 35/51] Improve validation and type checks and fix bugs --- dace/codegen/targets/cpu.py | 2 +- dace/codegen/targets/cuda.py | 7 ++-- dace/data.py | 2 +- dace/sdfg/sdfg.py | 2 +- dace/sdfg/validation.py | 4 ++ tests/deferred_alloc_test.py | 72 +++++++++++++++++++++++++++++++----- 6 files changed, 73 insertions(+), 16 deletions(-) diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 85c51cb8b1..2172bbc0da 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -344,7 +344,7 @@ def declare_array(self, size_desc_name = sdfg.arrays[name].size_desc_name if size_desc_name is not None: size_desc = sdfg.arrays[size_desc_name] - size_ctypedef = dtypes.pointer(size_desc.dtype).ctype + size_ctypedef = size_desc.dtype.ctype self._dispatcher.declared_arrays.add(size_desc_name, DefinedType.Pointer, size_ctypedef) return elif nodedesc.storage is dtypes.StorageType.CPU_ThreadLocal: diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index eb205d30c8..8bc51d0418 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1503,7 +1503,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub # make dynamic map inputs constant # TODO move this into _get_const_params(dfg_scope) # Do not add src as const if the size is being red (src_conn is _read_size) - const_params |= 
set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if not e.src_conn.endswith("size")) + const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if e.src_conn is None or (e.src_conn is not None and e.src_conn == "_read_size")) # Store init/exit code streams old_entry_stream = self.scope_entry_stream @@ -1626,8 +1626,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub for i in range(size_arr.shape[0]): if f"__{arr_name}_dim{i}_size" not in dyn_args: dyn_args.append(f"__{arr_name}_dim{i}_size") - dyn_args_typed.append(f"const {arg.dtype.ctype} __{arr_name}_dim{i}_size") - needed_size_scalars_declaration.append(f"const {arg.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];") + size_desc = sdfg.arrays[size_desc_name] + dyn_args_typed.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size") + needed_size_scalars_declaration.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];") self._localcode.write( '__global__ void %s %s(%s) {\n' % diff --git a/dace/data.py b/dace/data.py index a3b008f150..1678721062 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1442,7 +1442,7 @@ def __init__(self, else: self.offset = [0] * len(shape) - self.is_deferred_array = any(["__dace_defer" in str(dim) for dim in self.shape]) + self.is_deferred_array = any([str(dim).startswith("__dace_defer") for dim in self.shape]) self.validate() diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index a08f572782..6e80270ea8 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -1795,7 +1795,7 @@ def add_array(self, # convert strings to int if possible, unless it is not the reserved symbol for deferred allocation newshape = [] for i, s in enumerate(shape): - if isinstance(s, str) and s == "__dace_defer": + if isinstance(s, str) and s.startswith("__dace_defer"): newshape.append(dace.symbolic.pystr_to_symbolic(f"{s}_dim{i}")) else: try: diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index c4173dd181..55ed7570db 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -573,11 +573,15 @@ def validate_state(state: 'dace.sdfg.SDFGState', write_size_edges = list(state.edges_by_connector(node, insize)) # Reading-Writing the size is valid only if the array is transient and has the storage type CPU_Heap or GPU_Global + has_writes = len(write_size_edges) > 0 has_writes_or_reads = len(read_size_edges) + len(write_size_edges) > 0 size_access_allowed = arr.transient and (arr.storage == dtypes.StorageType.CPU_Heap or arr.storage == dtypes.StorageType.GPU_Global) if has_writes_or_reads and not size_access_allowed: raise InvalidSDFGNodeError('Reading the size of an array, or changing (writing to) the size of an array ' 'is only valid if the array is transient and the storage is CPU_Heap or GPU_Global', sdfg, state_id, nid) + if has_writes and scope[node] is not None: + raise InvalidSDFGNodeError('Resizing array is not allowed within a scope (e.g. 
not inside maps)', sdfg, state_id, nid)
+
     if len(write_size_edges) > 1:
         raise InvalidSDFGNodeError('One node can have at maximum one edge writing to its size descriptor', sdfg, state_id, nid)
 
diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py
index adc5427a9a..9aa8d86c14 100644
--- a/tests/deferred_alloc_test.py
+++ b/tests/deferred_alloc_test.py
@@ -4,6 +4,7 @@
 import numpy
 import pytest
 
+
 @pytest.fixture(params=[dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global])
 def storage_type(request):
     return request.param
@@ -73,7 +74,7 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool,
     arrn = state.add_access(arr_name)
 
     if storage_type == dace.dtypes.StorageType.CPU_Heap:
-        assert (schedule_type == dace.dtypes.ScheduleType.Sequential)
+        assert (schedule_type == dace.dtypes.ScheduleType.Sequential or schedule_type == dace.dtypes.ScheduleType.CPU_Multicore)
     elif storage_type == dace.dtypes.StorageType.GPU_Global:
         assert (schedule_type == dace.dtypes.ScheduleType.GPU_Device)
 
@@ -188,9 +189,55 @@ def test_trivial_realloc_cpu(transient: bool):
     _test_trivial_realloc(dace.dtypes.StorageType.CPU_Heap, transient)
 
 
-def test_realloc_inside_map():
+def _add_realloc_inside_map(sdfg: dace.SDFG, schedule_type: dace.dtypes.ScheduleType):
+    pre_state = sdfg.states()[0]
+    state = sdfg.add_state("s2")
+    sdfg.add_edge(pre_state, state, dace.InterstateEdge(None, None))
+
+    map_entry, map_exit = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,4,1)])},
+                                        schedule=schedule_type)
+    an_2 = state.add_access('A')
+    an_2.add_in_connector("_write_size")
+
+    t1 = state.add_tasklet(name="assign", inputs={}, outputs={"__out"}, code="__out=8")
+    t1.add_out_connector("__out")
+
+    _, _ = sdfg.add_array("tmp0", shape=(2, ), dtype=numpy.uint64, transient=True)
+    sca = state.add_access("tmp0")
+
+    state.add_edge(map_entry, None, t1, None, dace.Memlet(None))
+    state.add_edge(t1, "__out", sca, None, dace.Memlet("tmp0[0]"))
+    state.add_edge(sca, None, an_2, "_write_size", dace.Memlet("tmp0"))
+    state.add_edge(an_2, None, map_exit, None, dace.Memlet(None))
+
+
+def test_realloc_inside_map_gpu():
+    sdfg =_get_assign_map_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device)
+    _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.GPU_Device)
+    try:
+        sdfg.validate()
+    except Exception:
+        return
+
+    pytest.fail("Reallocation (a size write) inside a map scope did not fail validation when it was expected to.")
+
+def test_realloc_inside_map_cpu():
+    sdfg =_get_assign_map_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore)
+    _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.CPU_Multicore)
+    try:
+        sdfg.validate()
+    except Exception:
+        return
+
+    pytest.fail("Reallocation (a size write) inside a map scope did not fail validation when it was expected to.")
+
+def test_conditional_alloc_gpu():
+    pass
+
+def test_conditional_alloc_cpu():
     pass
 
+
 def test_incomplete_write_dimensions_1():
     sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2")
     try:
@@ -211,23 +258,28 @@ def test_incomplete_write_dimensions_2():
 
 
 if __name__ == "__main__":
+    print(f"Trivial Realloc within map {dace.dtypes.StorageType.CPU_Multicore}")
+    test_realloc_inside_map_cpu()
+    print(f"Trivial Realloc within map {dace.dtypes.StorageType.GPU_Device}")
+    test_realloc_inside_map_gpu()
+
     print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap}")
-
test_trivial_realloc_cpu(dace.dtypes.StorageType.CPU_Heap, True) + test_trivial_realloc_cpu(True) print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap}") - test_realloc_use_cpu(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.Sequential) + test_realloc_use_cpu(True) print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global}") - test_trivial_realloc_gpu(dace.dtypes.StorageType.GPU_Global, True) + test_trivial_realloc_gpu(True) print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global}") - test_realloc_use_gpu(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device) + test_realloc_use_gpu(True) print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap} on non-transient data") - test_trivial_realloc_cpu(dace.dtypes.StorageType.CPU_Heap, False) + test_trivial_realloc_cpu(False) print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap} on non-transient data") - test_realloc_use_cpu(dace.dtypes.StorageType.CPU_Heap, False, dace.dtypes.ScheduleType.Sequential) + test_realloc_use_cpu(False) print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global} on non-transient data") - test_trivial_realloc_gpu(dace.dtypes.StorageType.GPU_Global, False) + test_trivial_realloc_gpu(False) print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global} on non-transient data") - test_realloc_use_gpu(dace.dtypes.StorageType.GPU_Global, False, dace.dtypes.ScheduleType.GPU_Device) + test_realloc_use_gpu(False) print(f"Realloc with incomplete write 1") test_incomplete_write_dimensions_1() From 62bc08c6bc01026ff93d05254409662c8223e2e7 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 11 Dec 2024 11:33:13 +0100 Subject: [PATCH 36/51] Build on top of the GPU codegen hack --- dace/codegen/targets/cuda.py | 4 ++-- dace/codegen/targets/framecode.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 8bc51d0418..8c0e7800a4 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1502,8 +1502,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub const_params = _get_const_params(dfg_scope) # make dynamic map inputs constant # TODO move this into _get_const_params(dfg_scope) - # Do not add src as const if the size is being red (src_conn is _read_size) - const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if e.src_conn is None or (e.src_conn is not None and e.src_conn == "_read_size")) + # Do not add src as const if the size is being read (src_conn is _read_size) + const_params |= set((str(e.src)) for e in dace.sdfg.dynamic_map_inputs(state, scope_entry) if e.src_conn is None or not (e.src_conn is not None and e.src_conn == "_read_size")) # Store init/exit code streams old_entry_stream = self.scope_entry_stream diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 416bbe6ed5..baf215d903 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -964,7 +964,6 @@ def generate_code(self, # Only allocate arrays that really require deferred allocation (symbol has __dace_defer) # Reshaping these arrays are not allowed size_arrays = sdfg.size_arrays() - callsite_stream.write(f'//Declare size arrays\n', sdfg) for size_desc_name in size_arrays: size_nodedesc = sdfg.arrays[size_desc_name] assert ("__return" not in size_desc_name) @@ -979,7 
+978,7 @@ def generate_code(self, assert len(size_nodedesc.shape) == 1 alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' callsite_stream.write(alloc_str) - self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) + self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef, allow_shadowing=True) ####################################################################### # Generate actual program body From f50382b20dbc1cee3b4333e60a455a0f8947984c Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 11 Dec 2024 13:38:48 +0100 Subject: [PATCH 37/51] Improve proposal according to PR comments, improve support for more complex shapes, add tests --- dace/codegen/dispatcher.py | 3 - dace/codegen/targets/cpp.py | 31 ++++++- dace/codegen/targets/cpu.py | 63 ++++--------- dace/codegen/targets/cuda.py | 9 +- dace/codegen/targets/framecode.py | 3 +- dace/data.py | 2 +- dace/sdfg/sdfg.py | 3 +- dace/sdfg/state.py | 3 +- dace/sdfg/validation.py | 27 +++++- tests/deferred_alloc_test.py | 147 +++++++++++++++++++++++++++++- 10 files changed, 224 insertions(+), 67 deletions(-) diff --git a/dace/codegen/dispatcher.py b/dace/codegen/dispatcher.py index 2defa04680..c97e72b391 100644 --- a/dace/codegen/dispatcher.py +++ b/dace/codegen/dispatcher.py @@ -635,10 +635,7 @@ def dispatch_reallocate(self, src_node: nodes.Node, node: nodes.Node, edge: Mult state = cfg.state(state_id) target = self.get_reallocate_dispatcher(node, edge, sdfg, state) assert target is not None - if target is None: - return - # Dispatch reallocate self._used_targets.add(target) target.reallocate(sdfg, cfg, dfg, state_id, src_node, node, edge, function_stream, output_stream) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 494890089b..cfd1b202c7 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -548,8 +548,8 @@ def ndcopy_to_strided_copy( return None -def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed_veclen=1, indices=None, - deferred_size_names=None): +def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, + packed_veclen=1, indices=None, deferred_size_names=None): """ Creates a C++ expression that can be added to a pointer in order to offset it to the beginning of the given subset and offset. 
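As a concrete illustration of the hunks that follow (a minimal sketch, not code from the patch; the helper name and its inputs are assumptions): when a dimension of `shape` carries the `__dace_defer` symbol, the generated row-major offset expression reads that dimension from the runtime size array instead of the static shape.

def _render_row_major_offset(shape, indices, size_desc_name):
    # A deferred dimension (its symbolic size contains "__dace_defer") is
    # rendered as a size-array access such as "A_size[1]"; every other
    # dimension keeps its compile-time value.
    dims = [f"{size_desc_name}[{i}]" if "__dace_defer" in str(d) else str(d)
            for i, d in enumerate(shape)]
    expr = str(indices[0])
    for idx, dim in zip(indices[1:], dims[1:]):
        expr = f"({expr}) * ({dim}) + ({idx})"
    return expr

# _render_row_major_offset(("N", "__dace_defer_dim1"), ("i", "j"), "A_size")
# returns "(i) * (A_size[1]) + (j)"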
@@ -579,7 +579,7 @@ def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed if packed_veclen > 1: index /= packed_veclen - if not (deferred_size_names is None): + if deferred_size_names is not None: access_str_with_deferred_vars = sym2cpp(index) def replace_pattern(match): number = match.group(1) @@ -591,6 +591,27 @@ def replace_pattern(match): return sym2cpp(index) +def _get_deferred_size_names(desc, name): + if (desc.storage != dtypes.StorageType.GPU_Global and + desc.storage != dtypes.StorageType.CPU_Heap and + not desc.transient): + return None + def check_dace_defer(elements): + for elem in elements: + if "__dace_defer" in str(elem): + return True + return False + deferred_size_names = None + if check_dace_defer(desc.shape): + if desc.storage == dtypes.StorageType.GPU_Global or desc.storage == dtypes.StorageType.CPU_Heap: + deferred_size_names = [] + for i, elem in enumerate(desc.shape): + if "__dace_defer" in str(elem): + deferred_size_names.append(f"__{name}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]") + else: + deferred_size_names.append(elem) + return deferred_size_names if deferred_size_names is not None and len(deferred_size_names) > 0 else None + def cpp_array_expr(sdfg, memlet, with_brackets=True, @@ -600,14 +621,14 @@ def cpp_array_expr(sdfg, use_other_subset=False, indices=None, referenced_array=None, - codegen=None, - deferred_size_names=None): + codegen=None): """ Converts an Indices/Range object to a C++ array access string. """ subset = memlet.subset if not use_other_subset else memlet.other_subset s = subset if relative_offset else subsets.Indices(offset) o = offset if relative_offset else None desc : dace.Data = (sdfg.arrays[memlet.data] if referenced_array is None else referenced_array) desc_name = memlet.data + deferred_size_names = _get_deferred_size_names(desc, desc_name) offset_cppstr = cpp_offset_expr(desc, s, o, packed_veclen, indices=indices, deferred_size_names=deferred_size_names) # NOTE: Are there any cases where a mix of '.' and '->' is needed when traversing nested structs? diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 2172bbc0da..0798abc5e6 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1,5 +1,6 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
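Before the changes to this file, a brief sketch of the C++ that the CPU `reallocate` method further down is expected to emit for a two-dimensional deferred array `A` with size descriptor `A_size` (the identifier names here are assumptions for illustration, not captured code-generator output):

emitted_example = '''
A_size[0] = __A_new_size[0];
A_size[1] = __A_new_size[1];
A = static_cast<float *>(std::realloc(static_cast<void *>(A),
    A_size[0] * A_size[1] * sizeof(float)));
'''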
from copy import deepcopy +import re from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import ControlFlowRegion, SDFGState, StateSubgraphView import functools @@ -404,7 +405,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV # Compute array size arrsize = nodedesc.total_size - deferred_allocation = any([s for s in nodedesc.shape if str(s).startswith("__dace_defer")]) + deferred_allocation = any([s for s in nodedesc.shape if "__dace_defer" in str(s)]) arrsize_bytes = None if not isinstance(nodedesc.dtype, dtypes.opaque): arrsize_bytes = arrsize * nodedesc.dtype.bytes @@ -703,15 +704,22 @@ def reallocate( dtype = sdfg.arrays[data_name].dtype # Only consider the offsets with __dace_defer in original dim - mask_array = [str(dim).startswith("__dace_defer") for dim in data.shape] + mask_array = ["__dace_defer" in str(dim) for dim in data.shape] + + # In case the size does not only consist of a "__dace_defer" symbol but from an expression involving "__dace_defer" + # The size array is only updated with the symbol, and while calculating the expression, we only replace the __dace_defer_dim pattern + # With the corresponding access from the size array + new_size_strs = [] for i, mask in enumerate(mask_array): if mask: + new_size_str = cpp.sym2cpp(data.shape[i]) + pattern = r'__dace_defer_dim(\d+)' + new_size_strs.append(re.sub(pattern, lambda m: f'{new_size_array_name}[{m.group(1)}]', new_size_str)) callsite_stream.write( f"{size_array_name}[{i}] = {new_size_array_name}[{i}];" ) - # Call realloc only after no __dace_defer is left in size_array ? - size_str = " * ".join([f"{size_array_name}[{i}]" for i in range(len(data.shape))]) + size_str = " * ".join(new_size_strs) callsite_stream.write( f"{dst_node.data} = static_cast<{dtype} *>(std::realloc(static_cast({dst_node.data}), {size_str} * sizeof({dtype})));" ) @@ -749,34 +757,22 @@ def _emit_copy( if isinstance(dst_node, nodes.Tasklet): # Copy into tasklet - desc = sdfg.arrays[memlet.data] - deferred_size_names = self._get_deferred_size_names(desc, memlet) stream.write( - " " + self.memlet_definition(sdfg, memlet, False, vconn, dst_node.in_connectors[vconn], deferred_size_names=deferred_size_names), + " " + self.memlet_definition(sdfg, memlet, False, vconn, dst_node.in_connectors[vconn]), cfg, state_id, [src_node, dst_node], ) - if deferred_size_names is not None: - stream.write( - "// Size uses deferred allocation" - ) return elif isinstance(src_node, nodes.Tasklet): # Copy out of tasklet - desc = sdfg.arrays[memlet.data] - deferred_size_names = self._get_deferred_size_names(desc, memlet) stream.write( - " " + self.memlet_definition(sdfg, memlet, True, uconn, src_node.out_connectors[uconn], deferred_size_names=deferred_size_names), + " " + self.memlet_definition(sdfg, memlet, True, uconn, src_node.out_connectors[uconn]), cfg, state_id, [src_node, dst_node], ) - if deferred_size_names is not None: - stream.write( - "// Size uses deferred allocation" - ) return else: # Copy array-to-array src_nodedesc = src_node.desc(sdfg) @@ -1044,27 +1040,6 @@ def write_and_resolve_expr(self, sdfg: SDFG, memlet: mmlt.Memlet, nc: bool, outn custom_reduction = cpp.unparse_cr(sdfg, memlet.wcr, dtype) return (f'dace::wcr_custom<{dtype.ctype}>:: template {func}({custom_reduction}, {ptr}, {inname})') - def _get_deferred_size_names(self, desc, memlet): - if (desc.storage != dtypes.StorageType.GPU_Global and - desc.storage != dtypes.StorageType.CPU_Heap and - not desc.transient): - return None - def 
check_dace_defer(elements): - for elem in elements: - if isinstance(elem, symbolic.symbol) and str(elem).startswith("__dace_defer"): - return True - return False - deferred_size_names = None - if check_dace_defer(desc.shape): - if desc.storage == dtypes.StorageType.GPU_Global or desc.storage == dtypes.StorageType.CPU_Heap: - deferred_size_names = [] - for i, elem in enumerate(desc.shape): - if str(elem).startswith("__dace_defer"): - deferred_size_names.append(f"__{memlet.data}_dim{i}_size" if desc.storage == dtypes.StorageType.GPU_Global else f"{desc.size_desc_name}[{i}]") - else: - deferred_size_names.append(elem) - return deferred_size_names if deferred_size_names is not None and len(deferred_size_names) > 0 else None - def process_out_memlets(self, sdfg: SDFG, cfg: ControlFlowRegion, @@ -1201,8 +1176,7 @@ def process_out_memlets(self, # If the storage type if CPU_Heap or GPU_Global then it might be requiring deferred allocation # We can check if the array requires sepcial access using A_size[0] (CPU) or __A_dim0_size (GPU0) # by going through the shape and checking for symbols starting with __dace_defer - deferred_size_names = self._get_deferred_size_names(desc, memlet) - expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame, deferred_size_names=deferred_size_names) + expr = cpp.cpp_array_expr(sdfg, memlet, codegen=self._frame) write_expr = codegen.make_ptr_assignment(in_local_name, conntype, expr, desc_dtype) # Write out @@ -1339,8 +1313,7 @@ def memlet_definition(self, local_name: str, conntype: Union[data.Data, dtypes.typeclass] = None, allow_shadowing: bool = False, - codegen: 'CPUCodeGen' = None, - deferred_size_names = None): + codegen: 'CPUCodeGen' = None): # TODO: Robust rule set if conntype is None: raise ValueError('Cannot define memlet for "%s" without connector type' % local_name) @@ -1389,7 +1362,7 @@ def memlet_definition(self, decouple_array_interfaces=decouple_array_interfaces) result = '' - expr = (cpp.cpp_array_expr(sdfg, memlet, with_brackets=False, codegen=self._frame, deferred_size_names=deferred_size_names) + expr = (cpp.cpp_array_expr(sdfg, memlet, with_brackets=False, codegen=self._frame) if var_type in [DefinedType.Pointer, DefinedType.StreamArray, DefinedType.ArrayInterface] else ptr) if expr != ptr: @@ -1433,7 +1406,7 @@ def memlet_definition(self, if not memlet.dynamic and memlet.num_accesses == 1: if not output: if isinstance(desc, data.Stream) and desc.is_stream_array(): - index = cpp.cpp_offset_expr(desc, memlet.subset, deferred_size_names=deferred_size_names) + index = cpp.cpp_offset_expr(desc, memlet.subset) expr = f"{memlet.data}[{index}]" result += f'{memlet_type} {local_name} = ({expr}).pop();' defined = DefinedType.Scalar diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 8c0e7800a4..fb27a4d870 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -606,7 +606,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) ctypedef = '%s *' % nodedesc.dtype.ctype - deferred_allocation = any([s for s in nodedesc.shape if str(s).startswith("__dace_defer")]) + deferred_allocation = any([s for s in nodedesc.shape if "__dace_defer" in str(s)]) # Different types of GPU arrays if nodedesc.storage == dtypes.StorageType.GPU_Global: @@ -2794,7 +2794,8 @@ def reallocate( dtype = sdfg.arrays[data_name].dtype # Only consider the offsets 
with __dace_defer in original dim - mask_array = [str(dim).startswith("__dace_defer") for dim in data.shape] + mask_array = ["__dace_defer" in str(dim) for dim in data.shape] + print(mask_array) # Call realloc only after no __dace_defer is left in size_array (must be true) # Save new and old sizes before registering them, because we need both to compute the bound of the new array @@ -2829,8 +2830,12 @@ def reallocate( s += "}\n" callsite_stream.write(s) + new_size_strs = [] for i, mask in enumerate(mask_array): if mask: + new_size_str = cpp.sym2cpp(data.shape[i]) + pattern = r'__dace_defer_dim(\d+)' + new_size_strs.append(re.sub(pattern, lambda m: f'{new_size_array_name}[{m.group(1)}]', new_size_str)) callsite_stream.write( f"{size_array_name}[{i}] = {new_size_array_name}[{i}];" ) diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index baf215d903..a6f34789ac 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -973,7 +973,8 @@ def generate_code(self, if len(array) == 1: array = array[0] if type(array) == data.Array and array.is_deferred_array: - dimensions = ["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape] + # 0 is a placeholder value, it is not important what the value is + dimensions = ["0" if "__dace_defer" in cpp.sym2cpp(dim) else cpp.sym2cpp(dim) for dim in array.shape] size_str = ",".join(dimensions) assert len(size_nodedesc.shape) == 1 alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' diff --git a/dace/data.py b/dace/data.py index 1678721062..a3b008f150 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1442,7 +1442,7 @@ def __init__(self, else: self.offset = [0] * len(shape) - self.is_deferred_array = any([str(dim).startswith("__dace_defer") for dim in self.shape]) + self.is_deferred_array = any(["__dace_defer" in str(dim) for dim in self.shape]) self.validate() diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 6e80270ea8..2bcf6bdca9 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -500,7 +500,6 @@ def __init__(self, self._parent_sdfg = None self._parent_nsdfg_node = None self._arrays = NestedDict() # type: Dict[str, dt.Array] - self._arrays = NestedDict() self.arg_names = [] self._labels: Set[str] = set() self.global_code = {'frame': CodeBlock("", dtypes.Language.CPP)} @@ -1795,7 +1794,7 @@ def add_array(self, # convert strings to int if possible, unless it is not the reserved symbol for deferred allocation newshape = [] for i, s in enumerate(shape): - if isinstance(s, str) and s.startswith("__dace_defer"): + if isinstance(s, str) and "__dace_defer" in s: newshape.append(dace.symbolic.pystr_to_symbolic(f"{s}_dim{i}")) else: try: diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 4ea3e9047d..baad771684 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -421,8 +421,7 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto # Trace through scope entry using IN_# -> OUT_# if isinstance(curedge.dst, (nd.EntryNode, nd.ExitNode)): if curedge.dst_conn is None: - #raise ValueError("Destination connector cannot be None for {}".format(curedge.dst)) - break + raise ValueError("Destination connector cannot be None for {}".format(curedge.dst)) if not curedge.dst_conn.startswith("IN_"): # Map variable break next_edge = next(e for e in state.out_edges(curedge.dst) if e.src_conn == "OUT_" + curedge.dst_conn[3:]) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 
55ed7570db..cce266b573 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -324,6 +324,29 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context , sdfg, None ) + if isinstance(desc, dt.Array): #is_deferred_array and is_size_array are only defined for dt.Array + if desc.is_deferred_array: + if desc.is_size_array: + raise InvalidSDFGError( + f"A deferred array can't be used as a size array for another array. Data descriptor name: {desc}." + , sdfg, None + ) + if not desc.transient: + raise InvalidSDFGError( + f"Deferred arrays need to be transient." + , sdfg, None + ) + if "__return" in name: + raise InvalidSDFGError( + f"Deferred arrays can't be returned. {desc} has __return in its name." + , sdfg, None + ) + if desc.storage is not dtypes.StorageType.GPU_Global and desc.storage is not dtypes.StorageType.CPU_Heap: + raise InvalidSDFGError( + f"Deferred arrays are supported only for {dtypes.StorageType.GPU_Global} and {dtypes.StorageType.CPU_Heap} storage types for {desc}." + , sdfg, None + ) + # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) @@ -349,7 +372,7 @@ def _accessible(sdfg: 'dace.sdfg.SDFG', container: str, context: Dict[str, bool] """ Helper function that returns False if a data container cannot be accessed in the current SDFG context. """ - storage = sdfg.arrays[container].storage if container in sdfg.arrays else sdfg.arrays[container].storage + storage = sdfg.arrays[container].storage if storage == dtypes.StorageType.GPU_Global or storage in dtypes.GPU_STORAGES: return context.get('in_gpu', False) if storage == dtypes.StorageType.FPGA_Global or storage in dtypes.FPGA_STORAGES: @@ -929,7 +952,7 @@ def validate_state(state: 'dace.sdfg.SDFGState', # Check dimensionality of memory access if isinstance(e.data.subset, (sbs.Range, sbs.Indices)): - desc = sdfg.arrays[e.data.data] if e.data.data in sdfg.arrays else sdfg.arrays[e.data.data] + desc = sdfg.arrays[e.data.data] if e.data.subset.dims() != len(desc.shape): raise InvalidSDFGEdgeError( "Memlet subset uses the wrong dimensions" diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py index 9aa8d86c14..3a6ec97a9e 100644 --- a/tests/deferred_alloc_test.py +++ b/tests/deferred_alloc_test.py @@ -210,7 +210,6 @@ def _add_realloc_inside_map(sdfg: dace.SDFG, schedule_type: dace.dtypes.Schedule state.add_edge(sca, None, an_2, "_write_size", dace.Memlet("tmp0")) state.add_edge(an_2, None, map_exit, None, dace.Memlet(None)) - def test_realloc_inside_map_gpu(): sdfg =_get_assign_map_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device) _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.GPU_Device) @@ -231,12 +230,142 @@ def test_realloc_inside_map_cpu(): pytest.fail("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.") +def _get_conditional_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType, defer_expr_instead_of_symbol: bool = False): + sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_2") + + if not defer_expr_instead_of_symbol: + sdfg.add_array(name="A", shape=("__dace_defer", "__dace_defer"), dtype=dace.float32, storage=storage_type, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient) + else: + sdfg.add_array(name="A", shape=("4 * __dace_defer", "8 * __dace_defer"), dtype=dace.float32, storage=storage_type, + 
lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient) + + sdfg.add_scalar(name="path", transient=False, dtype=numpy.uint64) + + start = sdfg.add_state("s1") + iftrue = sdfg.add_state("s1_0") + iffalse = sdfg.add_state("s1_1") + assigntrue = sdfg.add_state("s2_0") + assignfalse = sdfg.add_state("s2_1") + state = sdfg.add_state("s3") + + sdfg.add_edge(start, iftrue, dace.InterstateEdge("path == 1")) + sdfg.add_edge(start, iffalse, dace.InterstateEdge("path != 1")) + sdfg.add_edge(iftrue, assigntrue, dace.InterstateEdge(None)) + sdfg.add_edge(iffalse, assignfalse, dace.InterstateEdge(None)) + sdfg.add_edge(assigntrue, state, dace.InterstateEdge(None)) + sdfg.add_edge(assignfalse, state, dace.InterstateEdge(None)) + + s1name, s1 = sdfg.add_array(name="size1", shape=(2,), dtype=numpy.uint64, storage=dace.dtypes.StorageType.Register, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=False) + s2name, s2 = sdfg.add_array(name="size2", shape=(2,), dtype=numpy.uint64, storage=dace.dtypes.StorageType.Register, + lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=False) + + an_2_0 = assigntrue.add_access('A') + an_2_0.add_in_connector('_write_size') + an_u_2_0 = assigntrue.add_access("size1") + assigntrue.add_edge(an_u_2_0, None, an_2_0, "_write_size", dace.memlet.Memlet("size1")) + + an_2_1 = assignfalse.add_access('A') + an_2_1.add_in_connector('_write_size') + an_u_2_1 = assignfalse.add_access("size2") + assignfalse.add_edge(an_u_2_1, None, an_2_1, "_write_size", dace.memlet.Memlet("size2")) + + if storage_type == dace.dtypes.StorageType.CPU_Heap: + assert (schedule_type == dace.dtypes.ScheduleType.Sequential or schedule_type == dace.dtypes.ScheduleType.CPU_Multicore) + elif storage_type == dace.dtypes.StorageType.GPU_Global: + assert (schedule_type == dace.dtypes.ScheduleType.GPU_Device) + + an_3 = state.add_access('A') + an_3.add_out_connector('_read_size') + map_entry, map_exit = state.add_map(name="map",ndrange={"i":dace.subsets.Range([(0,"__A_0-1",1)]), + "j":dace.subsets.Range([(0,"__A_1-1", 1)])}, + schedule=schedule_type) + state.add_edge(an_3, '_read_size', map_entry, "__A_0", dace.Memlet(expr="A_size[0]")) + state.add_edge(an_3, '_read_size', map_entry, "__A_1", dace.Memlet(expr="A_size[1]")) + map_entry.add_in_connector("__A_0") + map_entry.add_in_connector("__A_1") + map_exit.add_in_connector("IN_A") + map_exit.add_out_connector("OUT_A") + + t1 = state.add_tasklet(name="assign", inputs={}, outputs={"_out"}, code="_out=3.0") + state.add_edge(map_entry, None, t1, None, dace.Memlet(None)) + state.add_edge(t1, "_out", map_exit, "IN_A", dace.Memlet(expr="A[i, j]")) + + an_4 = state.add_access('A') + state.add_edge(map_exit, "OUT_A", an_4, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,"__A_0-1", 1), (0,"__A_1-1", 1)]))) + + an_4.add_out_connector('_read_size') + map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,"__A_0-1",1)]),"j":dace.subsets.Range([(0,"__A_1-1", 1)])}, + schedule=schedule_type) + state.add_edge(an_4, '_read_size', map_entry2, "__A_0", dace.Memlet(expr="A_size[0]")) + state.add_edge(an_4, '_read_size', map_entry2, "__A_1", dace.Memlet(expr="A_size[1]")) + state.add_edge(an_4, None, map_entry2, "IN_A", dace.Memlet(expr="A[0:__A_0, 0:__A_1]")) + map_entry2.add_in_connector("__A_0") + map_entry2.add_in_connector("__A_1") + map_entry2.add_in_connector("IN_A") + map_entry2.add_out_connector("OUT_A") + map_exit2.add_in_connector("IN_A") + map_exit2.add_out_connector("OUT_A") + + t2 = 
state.add_tasklet(name="check", inputs={"_in"}, outputs={"_out"}, code='_out = _in', language=dace.dtypes.Language.Python) + state.add_edge(map_entry2, "OUT_A", t2, "_in", dace.Memlet(expr="A[i, j]")) + state.add_edge(t2, "_out", map_exit2, "IN_A", dace.Memlet(expr="A[i, j]")) + + an_5 = state.add_access('A') + state.add_edge(map_exit2, "OUT_A", an_5, None, dace.Memlet(data="A", subset=dace.subsets.Range([(0,"__A_0-1", 1), (0,"__A_1-1", 1)]))) + + arr_name, arr = sdfg.add_array(name="example_array", dtype=dace.float32, shape=(1,), transient=False, storage=storage_type) + arrn = state.add_access(arr_name) + state.add_edge(an_5, None, arrn, None, dace.memlet.Memlet("A[0, 0]")) + + return sdfg + def test_conditional_alloc_gpu(): - pass + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + try: + import cupy + except Exception: + return + + arr = cupy.array([-1.0]).astype(cupy.float32) + sdfg(path=1, size1=size1, size2=size2, example_array=arr) + assert ( arr.get()[0] == 3.0 ) def test_conditional_alloc_cpu(): - pass + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + arr = numpy.array([-1.0]).astype(numpy.float32) + sdfg(path=0, size1=size1, size2=size2, example_array=arr) + assert ( arr[0] == 3.0 ) + +def test_conditional_alloc_with_expr_gpu(): + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device, True) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + try: + import cupy + except Exception: + return + + arr = cupy.array([-1.0]).astype(cupy.float32) + sdfg(path=1, size1=size1, size2=size2, example_array=arr) + assert ( arr.get()[0] == 3.0 ) +def test_conditional_alloc_with_expr_cpu(): + sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore, True) + sdfg.validate() + size1 = numpy.array([1, 1]).astype(numpy.uint64) + size2 = numpy.array([22, 22]).astype(numpy.uint64) + arr = numpy.array([-1.0]).astype(numpy.float32) + sdfg(path=0, size1=size1, size2=size2, example_array=arr) + assert ( arr[0] == 3.0 ) def test_incomplete_write_dimensions_1(): sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2") @@ -284,4 +413,14 @@ def test_incomplete_write_dimensions_2(): print(f"Realloc with incomplete write 1") test_incomplete_write_dimensions_1() print(f"Realloc with incomplete write 2") - test_incomplete_write_dimensions_2() \ No newline at end of file + test_incomplete_write_dimensions_2() + + print(f"Test conditional alloc with use cpu") + test_conditional_alloc_cpu() + print(f"Test conditional alloc with use gpu") + test_conditional_alloc_gpu() + + print(f"Test conditional alloc with use and the shape as a non-trivial expression cpu") + test_conditional_alloc_with_expr_cpu() + print(f"Test conditional alloc with use and the shape as a non-trivial expression gpu") + test_conditional_alloc_with_expr_gpu() \ No newline at end of file From 8c2f12d2e8d51f065cf80a9a3bc9d0fb99972bff Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 11 Dec 2024 14:09:52 +0100 Subject: [PATCH 38/51] Add tests, refactor, improve size 
calculation --- dace/codegen/targets/cpp.py | 23 +++++++++++++++++++++++ dace/codegen/targets/cpu.py | 21 ++++++--------------- dace/codegen/targets/cuda.py | 23 +++++++++-------------- tests/deferred_alloc_test.py | 34 +++++++++++++++++----------------- 4 files changed, 55 insertions(+), 46 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index cfd1b202c7..26b34637a3 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -612,6 +612,29 @@ def check_dace_defer(elements): deferred_size_names.append(elem) return deferred_size_names if deferred_size_names is not None and len(deferred_size_names) > 0 else None +def _get_realloc_dimensions(size_array_name:str, new_size_array_name:str, shape): + # Only consider the offsets with __dace_defer in original dim + mask_array = ["__dace_defer" in str(dim) for dim in shape] + + # In case the size does not only consist of a "__dace_defer" symbol but from an expression involving "__dace_defer" + # The size array is only updated with the symbol, and while calculating the expression, we only replace the __dace_defer_dim pattern + # With the corresponding access from the size array + size_assignment_strs = [] + new_size_strs = [] + old_size_strs = [] + for i, mask in enumerate(mask_array): + if mask: + new_size_str = sym2cpp(shape[i]) + pattern = r'__dace_defer_dim(\d+)' + new_size_strs.append(re.sub(pattern, lambda m: f'{new_size_array_name}[{m.group(1)}]', new_size_str)) + old_size_strs.append(re.sub(pattern, lambda m: f"{size_array_name}[{m.group(1)}]", new_size_str)) + size_assignment_strs.append( + f"{size_array_name}[{i}] = {new_size_array_name}[{i}];" + ) + else: + new_size_strs.append(sym2cpp(shape[i])) + return size_assignment_strs, new_size_strs, old_size_strs + def cpp_array_expr(sdfg, memlet, with_brackets=True, diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 0798abc5e6..6b052fb577 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -703,21 +703,12 @@ def reallocate( dtype = sdfg.arrays[data_name].dtype - # Only consider the offsets with __dace_defer in original dim - mask_array = ["__dace_defer" in str(dim) for dim in data.shape] - - # In case the size does not only consist of a "__dace_defer" symbol but from an expression involving "__dace_defer" - # The size array is only updated with the symbol, and while calculating the expression, we only replace the __dace_defer_dim pattern - # With the corresponding access from the size array - new_size_strs = [] - for i, mask in enumerate(mask_array): - if mask: - new_size_str = cpp.sym2cpp(data.shape[i]) - pattern = r'__dace_defer_dim(\d+)' - new_size_strs.append(re.sub(pattern, lambda m: f'{new_size_array_name}[{m.group(1)}]', new_size_str)) - callsite_stream.write( - f"{size_array_name}[{i}] = {new_size_array_name}[{i}];" - ) + size_assignment_strs, new_size_strs, _ = cpp._get_realloc_dimensions( + size_array_name, new_size_array_name, data.shape + ) + + for size_assignment in size_assignment_strs: + callsite_stream.write(size_assignment) size_str = " * ".join(new_size_strs) callsite_stream.write( diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index fb27a4d870..2222c2a002 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1,6 +1,7 @@ # Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
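For orientation while reading this file's changes: CUDA has no `realloc`, so the GPU backend reallocates by copy, i.e. allocate a new buffer, copy `min(old, new)` bytes, free the old buffer, and swap the pointers. A minimal user-level sketch of the same strategy, written with cupy purely for illustration (this helper is not part of DaCe):

import cupy

def gpu_realloc_by_copy(old, new_count):
    # Allocate the replacement buffer, copy the overlapping prefix
    # device-to-device, and return the new buffer; dropping the last
    # reference to `old` releases its memory back to the pool.
    new = cupy.empty(new_count, dtype=old.dtype)
    n = min(old.size, new_count)
    new[:n] = old[:n]
    return new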
import ctypes import functools +import re import warnings from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union @@ -2793,15 +2794,16 @@ def reallocate( dtype = sdfg.arrays[data_name].dtype - # Only consider the offsets with __dace_defer in original dim - mask_array = ["__dace_defer" in str(dim) for dim in data.shape] - print(mask_array) + size_assignment_strs, new_size_strs, old_size_strs = cpp._get_realloc_dimensions( + size_array_name, new_size_array_name, data.shape + ) + # Call realloc only after no __dace_defer is left in size_array (must be true) # Save new and old sizes before registering them, because we need both to compute the bound of the new array - old_size_str = " * ".join([f"{size_array_name}[{i}]" for i in range(len(data.shape))]) + old_size_str = " * ".join(old_size_strs) old_size_str += f" * sizeof({dtype.ctype})" - new_size_str = " * ".join([f"{new_size_array_name}[{i}]" if mask_array[i] else f"{size_array_name}[{i}]" for i in range(len(data.shape)) ]) + new_size_str = " * ".join(new_size_strs) new_size_str += f" * sizeof({dtype.ctype})" tmp_storage_name = "__tmp_realloc_move_storage" @@ -2830,15 +2832,8 @@ def reallocate( s += "}\n" callsite_stream.write(s) - new_size_strs = [] - for i, mask in enumerate(mask_array): - if mask: - new_size_str = cpp.sym2cpp(data.shape[i]) - pattern = r'__dace_defer_dim(\d+)' - new_size_strs.append(re.sub(pattern, lambda m: f'{new_size_array_name}[{m.group(1)}]', new_size_str)) - callsite_stream.write( - f"{size_array_name}[{i}] = {new_size_array_name}[{i}];" - ) + for size_assignment in size_assignment_strs: + callsite_stream.write(size_assignment) ######################################################################## ######################################################################## diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py index 3a6ec97a9e..eee4482ae5 100644 --- a/tests/deferred_alloc_test.py +++ b/tests/deferred_alloc_test.py @@ -387,40 +387,40 @@ def test_incomplete_write_dimensions_2(): if __name__ == "__main__": - print(f"Trivial Realloc within map {dace.dtypes.StorageType.CPU_Multicore}") + print(f"Trivial Realloc within map, cpu") test_realloc_inside_map_cpu() - print(f"Trivial Realloc within map {dace.dtypes.StorageType.GPU_Device}") + print(f"Trivial Realloc within map, gpu") test_realloc_inside_map_gpu() - print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap}") + print(f"Trivial Realloc with storage, cpu") test_trivial_realloc_cpu(True) - print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap}") + print(f"Trivial Realloc-Use with storage, cpu") test_realloc_use_cpu(True) - print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global}") + + print(f"Trivial Realloc with storage, gpu") test_trivial_realloc_gpu(True) - print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global}") + print(f"Trivial Realloc-Use with storage, gpu") test_realloc_use_gpu(True) - - print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap} on non-transient data") + print(f"Trivial Realloc with storage, cpu, on non-transient data") test_trivial_realloc_cpu(False) - print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap} on non-transient data") + print(f"Trivial Realloc-Use with storage, cpu, on non-transient data") test_realloc_use_cpu(False) - print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global} on non-transient data") + print(f"Trivial Realloc with storage, gpu, on 
non-transient data") test_trivial_realloc_gpu(False) - print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global} on non-transient data") + print(f"Trivial Realloc-Use with storage, gpu, on non-transient data") test_realloc_use_gpu(False) - print(f"Realloc with incomplete write 1") + print(f"Realloc with incomplete write one, validation") test_incomplete_write_dimensions_1() - print(f"Realloc with incomplete write 2") + print(f"Realloc with incomplete write two, validation") test_incomplete_write_dimensions_2() - print(f"Test conditional alloc with use cpu") + print(f"Test conditional alloc with use, cpu") test_conditional_alloc_cpu() - print(f"Test conditional alloc with use gpu") + print(f"Test conditional alloc with use, gpu") test_conditional_alloc_gpu() - print(f"Test conditional alloc with use and the shape as a non-trivial expression cpu") + print(f"Test conditional alloc with use and the shape as a non-trivial expression, cpu") test_conditional_alloc_with_expr_cpu() - print(f"Test conditional alloc with use and the shape as a non-trivial expression gpu") + print(f"Test conditional alloc with use and the shape as a non-trivial expression, gpu") test_conditional_alloc_with_expr_gpu() \ No newline at end of file From ede27040fb33a8bc21138858564decf84a5bf4eb Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 11 Dec 2024 14:15:41 +0100 Subject: [PATCH 39/51] Add array length checks to cutout test --- tests/sdfg/cutout_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/sdfg/cutout_test.py b/tests/sdfg/cutout_test.py index 3ce4db5ac8..0049e460b5 100644 --- a/tests/sdfg/cutout_test.py +++ b/tests/sdfg/cutout_test.py @@ -21,6 +21,7 @@ def simple_matmul(A: dace.float64[20, 20], B: dace.float64[20, 20]): cut_sdfg = SDFGCutout.singlestate_cutout(state, node) assert cut_sdfg.number_of_nodes() == 1 assert cut_sdfg.node(0).number_of_nodes() == 4 + assert len(set(cut_sdfg.arrays.keys()).difference(set(cut_sdfg.size_arrays()))) == 3 assert all(not a.transient for a in cut_sdfg.arrays.values()) @@ -41,6 +42,7 @@ def simple_matmul(A: dace.float64[20, 20], B: dace.float64[20, 20]): cut_sdfg = SDFGCutout.singlestate_cutout(state, *nodes) assert cut_sdfg.number_of_nodes() == 1 assert cut_sdfg.node(0).number_of_nodes() == 7 + assert len(set(cut_sdfg.arrays.keys()).difference(set(cut_sdfg.size_arrays()))) == 5 assert (not any(a.transient for a in cut_sdfg.arrays.values())) @@ -307,6 +309,7 @@ def test_input_output_configuration(): assert ct.arrays['tmp2'].transient == False assert ct.arrays['tmp3'].transient == True assert ct.arrays['tmp4'].transient == True + assert len(set(ct.arrays.keys()).difference(set(ct.size_arrays()))) == 4 def test_minimum_cut_simple_no_further_input_config(): From a6163c0252227ceba9cdd7b466b5c2db81b9bed0 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Wed, 11 Dec 2024 14:18:35 +0100 Subject: [PATCH 40/51] Refactor --- dace/sdfg/sdfg.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 2bcf6bdca9..8f0cce125b 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -418,10 +418,6 @@ class SDFG(ControlFlowRegion): desc="Data descriptors for this SDFG", to_json=_arrays_to_json, from_json=_nested_arrays_from_json) - _arrays = Property(dtype=NestedDict, - desc="Data size descriptors for this SDFG", - to_json=_arrays_to_json, - from_json=_nested_arrays_from_json) symbols = DictProperty(str, dtypes.typeclass, desc="Global symbols for this SDFG") instrument = 
EnumProperty(dtype=dtypes.InstrumentationType, @@ -1733,13 +1729,11 @@ def _find_new_name(self, name: str): """ Tries to find a new name by adding an underscore and a number. """ names = (self._arrays.keys() | self.constants_prop.keys() | self._pgrids.keys() | self._subarrays.keys() - | self._rdistrarrays.keys() | self.symbols.keys() | self._arrays.keys()) + | self._rdistrarrays.keys() | self.symbols.keys()) return dt.find_new_name(name, names) def is_name_used(self, name: str) -> bool: """ Checks if `name` is already used inside the SDFG.""" - if name in self._arrays: - return True if name in self._arrays: return True if name in self.symbols: @@ -2139,7 +2133,7 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str else: # We do not check for data constant, because there is a link between the constants and # the data descriptors. - if name in self.arrays or name in self.arrays: + if name in self.arrays: raise FileExistsError(f'Data descriptor "{name}" already exists in SDFG') if name in self.symbols: raise FileExistsError(f'Can not create data descriptor "{name}", the name is used by a symbol.') From ae084592419827cf43e8e6c4fe330d04751c6ff3 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 13 Dec 2024 15:47:40 +0100 Subject: [PATCH 41/51] Refactor and support CPU_Pinned --- dace/codegen/targets/cpp.py | 5 +-- dace/codegen/targets/cuda.py | 54 ++++++++++++++++--------- dace/sdfg/validation.py | 8 ++-- tests/deferred_alloc_test.py | 77 ++++++++++++++++++++++++++++++++---- 4 files changed, 112 insertions(+), 32 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 26b34637a3..8357ca1fa8 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -592,8 +592,7 @@ def replace_pattern(match): def _get_deferred_size_names(desc, name): - if (desc.storage != dtypes.StorageType.GPU_Global and - desc.storage != dtypes.StorageType.CPU_Heap and + if (desc.storage not in dtypes.REALLOCATABLE_STORAGES and not desc.transient): return None def check_dace_defer(elements): @@ -603,7 +602,7 @@ def check_dace_defer(elements): return False deferred_size_names = None if check_dace_defer(desc.shape): - if desc.storage == dtypes.StorageType.GPU_Global or desc.storage == dtypes.StorageType.CPU_Heap: + if desc.storage in dtypes.REALLOCATABLE_STORAGES: deferred_size_names = [] for i, elem in enumerate(desc.shape): if "__dace_defer" in str(elem): diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 2eacaae132..418cbbfdbd 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -136,6 +136,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): dispatcher.register_copy_dispatcher(dtypes.StorageType.Register, st, sched_type, illegal_copy) dispatcher.register_reallocate_dispatcher(dtypes.StorageType.GPU_Global, self) + dispatcher.register_reallocate_dispatcher(dtypes.StorageType.CPU_Pinned, self) # End of illegal copies # End of dispatcher registration ###################################### @@ -606,11 +607,12 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV arrsize = nodedesc.total_size is_dynamically_sized = symbolic.issymbolic(arrsize, sdfg.constants) arrsize_malloc = '%s * sizeof(%s)' % (sym2cpp(arrsize), nodedesc.dtype.ctype) + ctypedef = '%s *' % nodedesc.dtype.ctype deferred_allocation = any([s for s in nodedesc.shape if "__dace_defer" in str(s)]) - # Different types of GPU arrays - if nodedesc.storage == 
dtypes.StorageType.GPU_Global:
+        # Different types of GPU arrays
+        if nodedesc.storage in dtypes.REALLOCATABLE_STORAGES:
             if not declared:
                 declaration_stream.write('%s %s;\n' % (ctypedef, dataname))
                 self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)
@@ -2808,27 +2810,43 @@ def reallocate(
         tmp_storage_name = "__tmp_realloc_move_storage"
 
         callsite_stream.write(f"if ({dst_node.data} == nullptr) {{", cfg, state_id, dst_node.guid)
-        self._alloc_gpu_global(dst_node, data, callsite_stream, data_name, new_size_str)
+        if data.storage == dtypes.StorageType.GPU_Global:
+            self._alloc_gpu_global(dst_node, data, callsite_stream, data_name, new_size_str)
+        else:
+            assert data.storage == dtypes.StorageType.CPU_Pinned
+            callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast(&{data_name}), {new_size_str}));", cfg, state_id, dst_node.guid)
         callsite_stream.write("} else {\n", cfg, state_id, dst_node.guid)
 
         callsite_stream.write(f"{dtype}* {tmp_storage_name};")
-        self._alloc_gpu_global(None, data, callsite_stream, tmp_storage_name, new_size_str)
+        if data.storage == dtypes.StorageType.GPU_Global:
+            self._alloc_gpu_global(None, data, callsite_stream, tmp_storage_name, new_size_str)
+        else:
+            assert data.storage == dtypes.StorageType.CPU_Pinned
+            callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast(&{tmp_storage_name}), {new_size_str}));", cfg, state_id, dst_node.guid)
+
         s = ""
-        if not data.pool: # If pooled, will be freed somewhere else
-            copy_size_str = f"Min({old_size_str}, {new_size_str})"
-            s += f"DACE_GPU_CHECK({self.backend}Memcpy(static_cast({tmp_storage_name}), static_cast({data_name}), {copy_size_str}, cudaMemcpyDeviceToDevice));\n"
-            s += f"DACE_GPU_CHECK({self.backend}Free({data_name}));\n"
+        copy_size_str = f"Min({old_size_str}, {new_size_str})"
+        if data.storage == dtypes.StorageType.GPU_Global:
+            if not data.pool: # If pooled, will be freed somewhere else
+                s += f"DACE_GPU_CHECK({self.backend}Memcpy(static_cast({tmp_storage_name}), static_cast({data_name}), {copy_size_str}, cudaMemcpyDeviceToDevice));\n"
+                s += f"DACE_GPU_CHECK({self.backend}Free({data_name}));\n"
+                s += f"{data_name} = {tmp_storage_name};\n"
+            else:
+                cudastream = getattr(dst_node, '_cuda_stream', 'nullptr')
+                if cudastream != 'nullptr':
+                    cudastream = f'__state->gpu_context->streams[{cudastream}]'
+                s += f'DACE_GPU_CHECK({self.backend}MallocAsync(reinterpret_cast(&{tmp_storage_name}), {new_size_str}, {cudastream}));\n'
+                s += f"DACE_GPU_CHECK({self.backend}MemcpyAsync(static_cast({tmp_storage_name}), static_cast({data_name}), {copy_size_str}, cudaMemcpyDeviceToDevice, {cudastream}));\n"
+                s += f"DACE_GPU_CHECK({self.backend}FreeAsync({data_name}, {cudastream}));\n"
+                callsite_stream.write(s)
+                self._emit_sync(callsite_stream)
+                callsite_stream.write(f"{data_name} = {tmp_storage_name};\n")
+                s = ""
+        elif data.storage == dtypes.StorageType.CPU_Pinned:
+            s += f"DACE_GPU_CHECK({self.backend}Memcpy(static_cast({tmp_storage_name}), static_cast({data_name}), {copy_size_str}, cudaMemcpyHostToHost));\n"
+            s += f"DACE_GPU_CHECK({self.backend}FreeHost({data_name}));\n"
             s += f"{data_name} = {tmp_storage_name};\n"
         else:
-            cudastream = getattr(dst_node, '_cuda_stream', 'nullptr')
-            if cudastream != 'nullptr':
-                cudastream = f'__state->gpu_context->streams[{cudastream}]'
-            s += f'DACE_GPU_CHECK({self.backend}MallocAsync(static_cast(&{data_name}), {new_size_str}, {cudastream}));\n'
-            s += f"DACE_GPU_CHECK({self.backend}MemcpyAsync(static_cast({tmp_storage_name}), static_cast({data_name}), {copy_size_str}, {cudastream}), cudaMemcpyDeviceToDevice));\n"
-            s += f"DACE_GPU_CHECK({self.backend}FreeAsync({data_name}, {cudastream}));\n"
-            callsite_stream.write(s)
-            self._emit_sync(callsite_stream)
-            callsite_stream.write(f"{data_name} = {tmp_storage_name};\n")
-            s = ""
+            raise Exception("Realloc in CUDA, storage type must be CPU_Pinned or GPU_Global")
         s += "}\n"
         callsite_stream.write(s)
 
diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py
index cd37612185..c526d0e8a7 100644
--- a/dace/sdfg/validation.py
+++ b/dace/sdfg/validation.py
@@ -359,9 +359,9 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context
                         f"Deferred arrays can't be returned. {desc} has __return in its name."
                         , sdfg, None
                     )
-                if desc.storage is not dtypes.StorageType.GPU_Global and desc.storage is not dtypes.StorageType.CPU_Heap:
+                if desc.storage not in dtypes.REALLOCATABLE_STORAGES:
                     raise InvalidSDFGError(
-                        f"Deferred arrays are supported only for {dtypes.StorageType.GPU_Global} and {dtypes.StorageType.CPU_Heap} storage types for {desc}."
+                        f"Deferred arrays are supported only for {dtypes.REALLOCATABLE_STORAGES} storage types for {desc}."
                         , sdfg, None
                     )
@@ -616,10 +616,10 @@ def validate_state(state: 'dace.sdfg.SDFGState',
         # Reading-Writing the size is valid only if the array is transient and has the storage type CPU_Heap or GPU_Global
         has_writes = len(write_size_edges) > 0
         has_writes_or_reads = len(read_size_edges) + len(write_size_edges) > 0
-        size_access_allowed = arr.transient and (arr.storage == dtypes.StorageType.CPU_Heap or arr.storage == dtypes.StorageType.GPU_Global)
+        size_access_allowed = arr.transient and (arr.storage in dtypes.REALLOCATABLE_STORAGES)
         if has_writes_or_reads and not size_access_allowed:
             raise InvalidSDFGNodeError('Reading the size of an array, or changing (writing to) the size of an array '
-                                       'is only valid if the array is transient and the storage is CPU_Heap or GPU_Global', sdfg, state_id, nid)
+                                       f'is only valid if the array is transient and the storage is in {dtypes.REALLOCATABLE_STORAGES}', sdfg, state_id, nid)
 
         if has_writes and scope[node] is not None:
             raise InvalidSDFGNodeError('Resizing array is not allowed within a scope (e.g.
not inside maps)', sdfg, state_id, nid)
diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py
index eee4482ae5..1d9df3a200 100644
--- a/tests/deferred_alloc_test.py
+++ b/tests/deferred_alloc_test.py
@@ -5,7 +5,7 @@
 import pytest
 
 
-@pytest.fixture(params=[dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global])
+@pytest.fixture(params=[dace.dtypes.StorageType.CPU_Heap, dace.dtypes.StorageType.GPU_Global, dace.dtypes.StorageType.CPU_Pinned])
 def storage_type(request):
     return request.param
 
@@ -19,6 +19,8 @@ def schedule_type(storage_type):
         return dace.dtypes.ScheduleType.Sequential
     elif storage_type == dace.dtypes.StorageType.GPU_Global:
         return dace.dtypes.ScheduleType.GPU_Device
+    elif storage_type == dace.dtypes.StorageType.CPU_Pinned:
+        return dace.dtypes.ScheduleType.Sequential
 
 def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, write_size="0:2"):
     sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_1")
@@ -77,6 +79,8 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool,
         assert (schedule_type == dace.dtypes.ScheduleType.Sequential or schedule_type == dace.dtypes.ScheduleType.CPU_Multicore)
     elif storage_type == dace.dtypes.StorageType.GPU_Global:
         assert (schedule_type == dace.dtypes.ScheduleType.GPU_Device)
+    elif storage_type == dace.dtypes.StorageType.CPU_Pinned:
+        assert (schedule_type == dace.dtypes.ScheduleType.Sequential)
 
     an_3.add_out_connector('_read_size')
     map_entry2, map_exit2 = state.add_map(name="map2",ndrange={"i":dace.subsets.Range([(0,15-1,1)]),"j":dace.subsets.Range([(0,"__A_dim1_size-1", 1)])},
@@ -101,7 +105,7 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool,
     return sdfg
 
 def _valid_to_reallocate(transient, storage_type):
-    return transient and (storage_type == dace.dtypes.StorageType.GPU_Global or storage_type == dace.dtypes.StorageType.CPU_Heap)
+    return transient and (storage_type in dace.dtypes.REALLOCATABLE_STORAGES)
 
 def _test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool):
     sdfg = _get_trivial_alloc_sdfg(storage_type, transient)
@@ -138,12 +142,12 @@ def _test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sc
         raise AssertionError("Realloc-use with non-transient data did not fail when it was expected to.")
 
     compiled_sdfg = sdfg.compile()
-    if storage_type == dace.dtypes.StorageType.CPU_Heap:
+    if storage_type == dace.dtypes.StorageType.CPU_Heap or storage_type == dace.dtypes.StorageType.CPU_Pinned:
         arr = numpy.array([-1.0]).astype(numpy.float32)
         user_size = numpy.array([10, 10]).astype(numpy.uint64)
         compiled_sdfg(user_size=user_size, example_array=arr)
         assert ( arr[0] == 3.0 )
-    if storage_type == dace.dtypes.StorageType.GPU_Global:
+    elif storage_type == dace.dtypes.StorageType.GPU_Global:
         try:
             import cupy
         except Exception:
@@ -158,12 +162,12 @@ def _test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sc
     sdfg.apply_transformations_repeated([StateFusion, RedundantArray, RedundantSecondArray])
     sdfg.validate()
     compiled_sdfg = sdfg.compile()
-    if storage_type == dace.dtypes.StorageType.CPU_Heap:
+    if storage_type == dace.dtypes.StorageType.CPU_Heap or storage_type == dace.dtypes.StorageType.CPU_Pinned:
         arr = numpy.array([-1.0]).astype(numpy.float32)
         user_size = numpy.array([10, 10]).astype(numpy.uint64)
         compiled_sdfg(user_size=user_size, example_array=arr)
         assert ( arr[0] == 3.0 )
-    if storage_type == dace.dtypes.StorageType.GPU_Global:
+    elif storage_type == dace.dtypes.StorageType.GPU_Global:
         try:
             import cupy
         except Exception:
@@ -181,10 +185,18 @@ def test_realloc_use_gpu(transient: bool):
 def test_realloc_use_cpu(transient: bool):
     _test_realloc_use(dace.dtypes.StorageType.CPU_Heap, transient, dace.dtypes.ScheduleType.Sequential)
 
+@pytest.mark.gpu
+def test_realloc_use_cpu_pinned(transient: bool):
+    _test_realloc_use(dace.dtypes.StorageType.CPU_Pinned, transient, dace.dtypes.ScheduleType.Sequential)
+
 @pytest.mark.gpu
 def test_trivial_realloc_gpu(transient: bool):
     _test_trivial_realloc(dace.dtypes.StorageType.GPU_Global, transient)
 
+@pytest.mark.gpu
+def test_trivial_realloc_cpu_pinned(transient: bool):
+    _test_trivial_realloc(dace.dtypes.StorageType.CPU_Pinned, transient)
+
 def test_trivial_realloc_cpu(transient: bool):
     _test_trivial_realloc(dace.dtypes.StorageType.CPU_Heap, transient)
 
@@ -220,6 +232,17 @@ def test_realloc_inside_map_gpu():
         pytest.fail("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.")
 
 
+def test_realloc_inside_map_cpu_pinned():
+    sdfg =_get_assign_map_sdfg(dace.dtypes.StorageType.CPU_Pinned, True, dace.dtypes.ScheduleType.Sequential)
+    _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.Sequential)
+    try:
+        sdfg.validate()
+    except Exception:
+        return
+
+    pytest.fail("Realloc-use with non-transient data and incomplete write did not fail when it was expected to.")
+
+
 def test_realloc_inside_map_cpu():
     sdfg =_get_assign_map_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore)
     _add_realloc_inside_map(sdfg, dace.dtypes.ScheduleType.CPU_Multicore)
@@ -275,6 +298,8 @@ def _get_conditional_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient
         assert (schedule_type == dace.dtypes.ScheduleType.Sequential or schedule_type == dace.dtypes.ScheduleType.CPU_Multicore)
     elif storage_type == dace.dtypes.StorageType.GPU_Global:
         assert (schedule_type == dace.dtypes.ScheduleType.GPU_Device)
+    elif storage_type == dace.dtypes.StorageType.CPU_Pinned:
+        assert (schedule_type == dace.dtypes.ScheduleType.Sequential)
 
     an_3 = state.add_access('A')
     an_3.add_out_connector('_read_size')
@@ -321,6 +346,7 @@ def _get_conditional_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient
 
     return sdfg
 
+@pytest.mark.gpu
 def test_conditional_alloc_gpu():
     sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device)
     sdfg.validate()
@@ -335,6 +361,16 @@ def test_conditional_alloc_gpu():
     sdfg(path=1, size1=size1, size2=size2, example_array=arr)
     assert ( arr.get()[0] == 3.0 )
 
+@pytest.mark.gpu
+def test_conditional_alloc_cpu_pinned():
+    sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Pinned, True, dace.dtypes.ScheduleType.Sequential)
+    sdfg.validate()
+    size1 = numpy.array([1, 1]).astype(numpy.uint64)
+    size2 = numpy.array([22, 22]).astype(numpy.uint64)
+    arr = numpy.array([-1.0]).astype(numpy.float32)
+    sdfg(path=1, size1=size1, size2=size2, example_array=arr)
+    assert ( arr.get()[0] == 3.0 )
+
 def test_conditional_alloc_cpu():
     sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore)
     sdfg.validate()
@@ -344,6 +380,7 @@ def test_conditional_alloc_cpu():
     sdfg(path=0, size1=size1, size2=size2, example_array=arr)
     assert ( arr[0] == 3.0 )
 
+@pytest.mark.gpu
 def test_conditional_alloc_with_expr_gpu():
     sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.GPU_Global, True, dace.dtypes.ScheduleType.GPU_Device, True)
     sdfg.validate()
@@ -358,6 +395,16 @@ def test_conditional_alloc_with_expr_gpu():
     sdfg(path=1, size1=size1, size2=size2, example_array=arr)
     assert ( arr.get()[0] == 3.0 )
 
+@pytest.mark.gpu
+def test_conditional_alloc_with_expr_cpu_pinned():
+    sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Pinned, True, dace.dtypes.ScheduleType.Sequential, True)
+    sdfg.validate()
+    size1 = numpy.array([1, 1]).astype(numpy.uint64)
+    size2 = numpy.array([22, 22]).astype(numpy.uint64)
+    arr = numpy.array([-1.0]).astype(numpy.float32)
+    sdfg(path=1, size1=size1, size2=size2, example_array=arr)
+    assert ( arr.get()[0] == 3.0 )
+
 def test_conditional_alloc_with_expr_cpu():
     sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore, True)
     sdfg.validate()
@@ -391,24 +438,36 @@ def test_incomplete_write_dimensions_2():
     test_realloc_inside_map_cpu()
     print(f"Trivial Realloc within map, gpu")
     test_realloc_inside_map_gpu()
+    print(f"Trivial Realloc within map, cpu pinned")
+    test_realloc_inside_map_cpu_pinned()
 
     print(f"Trivial Realloc with storage, cpu")
     test_trivial_realloc_cpu(True)
     print(f"Trivial Realloc-Use with storage, cpu")
     test_realloc_use_cpu(True)
+    print(f"Trivial Realloc within map, cpu pinned")
+    test_realloc_use_cpu_pinned(True)
+
     print(f"Trivial Realloc with storage, gpu")
     test_trivial_realloc_gpu(True)
     print(f"Trivial Realloc-Use with storage, gpu")
     test_realloc_use_gpu(True)
+    print(f"Trivial Realloc-Use with storage, cpu pinned")
+    test_realloc_use_cpu_pinned(True)
+
     print(f"Trivial Realloc with storage, cpu, on non-transient data")
     test_trivial_realloc_cpu(False)
     print(f"Trivial Realloc-Use with storage, cpu, on non-transient data")
     test_realloc_use_cpu(False)
     print(f"Trivial Realloc with storage, gpu, on non-transient data")
     test_trivial_realloc_gpu(False)
+    print(f"Trivial Realloc-Use with storage, gpu, on non-transient data")
     test_realloc_use_gpu(False)
+    print(f"Trivial Realloc with storage, cpu pinned, on non-transient data")
+    test_trivial_realloc_cpu_pinned(False)
+    print(f"Trivial Realloc-Use with storage, cpu pinned, on non-transient data")
+    test_realloc_use_cpu_pinned(False)
 
     print(f"Realloc with incomplete write one, validation")
     test_incomplete_write_dimensions_1()
@@ -419,8 +478,12 @@ def test_incomplete_write_dimensions_2():
     test_conditional_alloc_cpu()
     print(f"Test conditional alloc with use, gpu")
     test_conditional_alloc_gpu()
+    print(f"Test conditional alloc with use, cpu pinned")
+    test_conditional_alloc_cpu_pinned()
     print(f"Test conditional alloc with use and the shape as a non-trivial expression, cpu")
     test_conditional_alloc_with_expr_cpu()
     print(f"Test conditional alloc with use and the shape as a non-trivial expression, gpu")
-    test_conditional_alloc_with_expr_gpu()
\ No newline at end of file
+    test_conditional_alloc_with_expr_gpu()
+    print(f"Test conditional alloc with use and the shape as a non-trivial expression, cpu pinned")
+    test_conditional_alloc_with_expr_cpu_pinned()
\ No newline at end of file
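The predicate driving all of these tests is small enough to state directly: reallocation is only valid for transient data placed in one of the reallocatable storage types. A sketch of that rule, mirroring _valid_to_reallocate above (REALLOCATABLE_STORAGES is introduced in dace/dtypes.py by the next patch):

import dace

# Illustrative restatement of _valid_to_reallocate from the test file:
# only transient arrays in a reallocatable storage may be resized.
def valid_to_reallocate(transient: bool, storage: dace.dtypes.StorageType) -> bool:
    return transient and storage in dace.dtypes.REALLOCATABLE_STORAGES

assert valid_to_reallocate(True, dace.dtypes.StorageType.CPU_Heap)
assert not valid_to_reallocate(False, dace.dtypes.StorageType.CPU_Heap)
assert not valid_to_reallocate(True, dace.dtypes.StorageType.Register)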
From bb04e1acb29c4e39b91132a0da6db92740d89f65 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Fri, 13 Dec 2024 16:26:15 +0100
Subject: [PATCH 42/51] Refactor and fix GPU array index generation

---
 dace/codegen/targets/cpp.py       |  1 +
 dace/codegen/targets/cuda.py      | 10 +++++++---
 dace/codegen/targets/framecode.py |  2 +-
 dace/data.py                      |  4 ++--
 dace/dtypes.py                    |  6 ++++++
 tests/deferred_alloc_test.py      | 32 ++++++++++++-------------------
 6 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py
index 8357ca1fa8..ed7c7bba49 100644
--- a/dace/codegen/targets/cpp.py
+++ b/dace/codegen/targets/cpp.py
@@ -631,6 +631,7 @@ def _get_realloc_dimensions(size_array_name:str, new_size_array_name:str, shape)
                 f"{size_array_name}[{i}] = {new_size_array_name}[{i}];"
             )
         else:
+            old_size_strs.append(sym2cpp(shape[i]))
             new_size_strs.append(sym2cpp(shape[i]))
     return size_assignment_strs, new_size_strs, old_size_strs
 
diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 418cbbfdbd..6d77daa219 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1632,6 +1632,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
                 size_desc = sdfg.arrays[size_desc_name]
                 dyn_args_typed.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size")
                 needed_size_scalars_declaration.append(f"const {size_desc.dtype.ctype} __{arr_name}_dim{i}_size = {size_desc_name}[{i}];")
+        #raise Exception(needed_size_scalars_declaration, dyn_args)
 
         self._localcode.write(
             '__global__ void %s %s(%s) {\n' %
@@ -2065,6 +2066,9 @@ def generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: S
 
         # handle dynamic map inputs
         for e in dace.sdfg.dynamic_map_inputs(cfg.node(state_id), dfg_scope.source_nodes()[0]):
+            # If src is a _read_size, it was handled before
+            if e.src_conn is not None and e.src_conn == "_read_size":
+                continue
             kernel_stream.write(
                 self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), cfg, state_id,
@@ -2810,14 +2814,14 @@ def reallocate(
         tmp_storage_name = "__tmp_realloc_move_storage"
 
         callsite_stream.write(f"if ({dst_node.data} == nullptr) {{", cfg, state_id, dst_node.guid)
-        if data.storage == dtypes.StorageType.GPU_Global:
-            assert data.storage == dtypes.StorageType.CPU_Pinned
+        if data.storage == dtypes.StorageType.GPU_Global:
             self._alloc_gpu_global(dst_node, data, callsite_stream, data_name, new_size_str)
         else:
+            assert data.storage == dtypes.StorageType.CPU_Pinned
             callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast<void **>(&{data_name}), {new_size_str}));", cfg, state_id, dst_node.guid)
         callsite_stream.write("} else {\n", cfg, state_id, dst_node.guid)
         callsite_stream.write(f"{dtype}* {tmp_storage_name};")
-        if data.storage == dtypes.StorageType.GPU_Global:
+        if data.storage == dtypes.StorageType.GPU_Global:
             self._alloc_gpu_global(None, data, callsite_stream, tmp_storage_name, new_size_str)
         else:
             assert data.storage == dtypes.StorageType.CPU_Pinned
diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py
index 4e4f4bc372..47a7ab03a4 100644
--- a/dace/codegen/targets/framecode.py
+++ b/dace/codegen/targets/framecode.py
@@ -969,7 +969,7 @@ def generate_code(self,
                     size_nodedesc = sdfg.arrays[size_desc_name]
                     assert ("__return" not in size_desc_name)
                     ctypedef = size_nodedesc.dtype.ctype
-                    array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name]
+                    array = [v for v in sdfg.arrays.values() if type(v) == data.Array and v.size_desc_name is not None and v.size_desc_name == size_desc_name]
                     assert len(array) <= 1
                     if len(array) == 1:
                         array = array[0]
diff --git a/dace/data.py b/dace/data.py
index a3b008f150..509da50cf6 100644
--- a/dace/data.py
+++ b/dace/data.py
@@ -183,7 +183,7 @@ def _transient_setter(self, value):
                                 default=dtypes.AllocationLifetime.Scope)
     location = DictProperty(key_type=str, value_type=str, desc='Full storage location identifier (e.g., rank, GPU ID)')
     debuginfo = DebugInfoProperty(allow_none=True)
-    size_desc_name = Property(dtype=str, default=None, allow_none=True)
+
 
     def __init__(self, dtype, shape, transient, storage, location, lifetime, debuginfo):
         self.dtype = dtype
@@ -193,7 +193,6 @@ def __init__(self, dtype, shape, transient, storage, location, lifetime, debugin
         self.location = location if location is not None else {}
         self.lifetime = lifetime
         self.debuginfo = debuginfo
-        self.size_desc_name = None
         self._validate()
 
     def __call__(self):
@@ -1387,6 +1386,7 @@ class Array(Data):
                                'it is inferred by other properties and the OptionalArrayInference pass.')
     pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred')
 
+    size_desc_name = Property(dtype=str, default=None, allow_none=True, desc="Name of the size desc, not None only for reallocatable storage types that are also transient")
     is_size_array = Property(dtype=bool, default=False, desc='Special array that is used to track the size of an another array')
     is_deferred_array = Property(dtype=bool, default=False, desc='Array that requires deferred allocation')
diff --git a/dace/dtypes.py b/dace/dtypes.py
index 465e73b2b1..d09bfd4210 100644
--- a/dace/dtypes.py
+++ b/dace/dtypes.py
@@ -103,6 +103,12 @@ class ScheduleType(aenum.AutoNumberEnum):
     StorageType.FPGA_ShiftRegister,
 ]
 
+REALLOCATABLE_STORAGES = [
+    StorageType.CPU_Heap,
+    StorageType.CPU_Pinned,
+    StorageType.GPU_Global,
+]
+
 
 @undefined_safe_enum
 class ReductionType(aenum.AutoNumberEnum):
diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py
index 1d9df3a200..73ddb7d40a 100644
--- a/tests/deferred_alloc_test.py
+++ b/tests/deferred_alloc_test.py
@@ -369,7 +369,7 @@ def test_conditional_alloc_cpu_pinned():
     size2 = numpy.array([22, 22]).astype(numpy.uint64)
     arr = numpy.array([-1.0]).astype(numpy.float32)
     sdfg(path=1, size1=size1, size2=size2, example_array=arr)
-    assert ( arr.get()[0] == 3.0 )
+    assert ( arr[0] == 3.0 )
 
 def test_conditional_alloc_cpu():
     sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore)
@@ -403,7 +403,7 @@ def test_conditional_alloc_with_expr_cpu_pinned():
     size2 = numpy.array([22, 22]).astype(numpy.uint64)
     arr = numpy.array([-1.0]).astype(numpy.float32)
     sdfg(path=1, size1=size1, size2=size2, example_array=arr)
-    assert ( arr.get()[0] == 3.0 )
+    assert ( arr[0] == 3.0 )
 
 def test_conditional_alloc_with_expr_cpu():
     sdfg =_get_conditional_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, dace.dtypes.ScheduleType.CPU_Multicore, True)
@@ -443,32 +443,24 @@ def test_incomplete_write_dimensions_2():
 
     print(f"Trivial Realloc with storage, cpu")
     test_trivial_realloc_cpu(True)
-    print(f"Trivial Realloc-Use with storage, cpu")
-    test_realloc_use_cpu(True)
-    print(f"Trivial Realloc within map, cpu pinned")
-    test_realloc_use_cpu_pinned(True)
-
     print(f"Trivial Realloc with storage, gpu")
     test_trivial_realloc_gpu(True)
-    print(f"Trivial Realloc-Use with storage, gpu")
-    test_realloc_use_gpu(True)
-    print(f"Trivial Realloc-Use with storage, cpu pinned")
-    test_realloc_use_cpu_pinned(True)
+    print(f"Trivial Realloc with storage, cpu pinned")
+    test_trivial_realloc_cpu_pinned(True)
 
     print(f"Trivial Realloc with storage, cpu, on non-transient data")
     test_trivial_realloc_cpu(False)
-    print(f"Trivial Realloc-Use with storage, cpu, on non-transient data")
-    test_realloc_use_cpu(False)
-    print(f"Trivial Realloc with storage, gpu, on non-transient data")
-    test_trivial_realloc_gpu(False)
-    print(f"Trivial Realloc-Use with storage, gpu, on non-transient data")
-    test_realloc_use_gpu(False)
+    print(f"Trivial Realloc with storage, gpu, on non-transient data")
+    test_trivial_realloc_gpu(False)
     print(f"Trivial Realloc with storage, cpu pinned, on non-transient data")
     test_trivial_realloc_cpu_pinned(False)
-    print(f"Trivial Realloc-Use with storage, cpu pinned, on non-transient data")
-    test_realloc_use_cpu_pinned(False)
+    print(f"Trivial Realloc-Use with storage, cpu")
+    test_realloc_use_cpu(True)
+    print(f"Trivial Realloc-Use with storage, gpu")
+    test_realloc_use_gpu(True)
+    print(f"Trivial Realloc-Use with storage, cpu pinned")
+    test_realloc_use_cpu_pinned(True)
 
     print(f"Realloc with incomplete write one, validation")
     test_incomplete_write_dimensions_1()
     print(f"Realloc with incomplete write two, validation")
@@ -486,4 +478,4 @@ def test_incomplete_write_dimensions_2():
     print(f"Test conditional alloc with use and the shape as a non-trivial expression, gpu")
     test_conditional_alloc_with_expr_gpu()
     print(f"Test conditional alloc with use and the shape as a non-trivial expression, cpu pinned")
-    test_conditional_alloc_with_expr_cpu_pinned()
\ No newline at end of file
+    test_conditional_alloc_with_expr_cpu_pinned()
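The one-line cpp.py fix above matters because _get_realloc_dimensions must return old- and new-size expression lists of equal rank; before the fix, static dimensions were missing from the old-size list. Below is a hand-written approximation of the helper's apparent contract (the real code in dace/codegen/targets/cpp.py uses sym2cpp on symbolic dimensions; this sketch uses plain str and is illustrative only):

def get_realloc_dimensions(size_array_name, new_size_array_name, shape):
    # Rough reimplementation for illustration; not the actual DaCe helper.
    size_assignment_strs, new_size_strs, old_size_strs = [], [], []
    for i, dim in enumerate(shape):
        if '__dace_defer' in str(dim):
            # Deferred dimension: copy the runtime value into the size
            # descriptor, and read old/new extents from the descriptors.
            size_assignment_strs.append(
                f"{size_array_name}[{i}] = {new_size_array_name}[{i}];")
            old_size_strs.append(f"{size_array_name}[{i}]")
            new_size_strs.append(f"{new_size_array_name}[{i}]")
        else:
            # Static dimension: the fix records it in old_size_strs too,
            # keeping old and new size expressions at the same rank.
            old_size_strs.append(str(dim))
            new_size_strs.append(str(dim))
    return size_assignment_strs, new_size_strs, old_size_strs

assigns, new_sz, old_sz = get_realloc_dimensions(
    '__A_size', '__A_new_size', ('__dace_defer', 32))
# assigns == ['__A_size[0] = __A_new_size[0];']
# new_sz == ['__A_new_size[0]', '32'], old_sz == ['__A_size[0]', '32']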
From 02a48e8227b2a28418007068bc9e439de0370ed5 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Fri, 13 Dec 2024 17:04:06 +0100
Subject: [PATCH 43/51] Fixes to size desc name checks

---
 dace/codegen/targets/cpu.py     |  9 ++++---
 dace/codegen/targets/cuda.py    | 27 ++++++++++---------
 dace/sdfg/sdfg.py               | 16 ++++++-----
 .../passes/array_elimination.py |  2 +-
 4 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index af7bb0502b..2d1c33df44 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -341,7 +341,7 @@ def declare_array(self,
         self._dispatcher.declared_arrays.add(name, DefinedType.Pointer, ctypedef)
 
         # Size desc is defined only for transient arrays
-        if nodedesc.transient and nodedesc.storage == dtypes.StorageType.CPU_Heap:
+        if nodedesc.transient and nodedesc.storage == dtypes.StorageType.CPU_Heap and type(nodedesc) == data.Array:
             size_desc_name = sdfg.arrays[name].size_desc_name
             if size_desc_name is not None:
                 size_desc = sdfg.arrays[size_desc_name]
@@ -698,13 +698,14 @@ def reallocate(
 
         data_name = dst_node.data
         new_size_array_name = src_node.data
 
-        data = sdfg.arrays[data_name]
-        size_array_name = data.size_desc_name
+        desc = sdfg.arrays[data_name]
+        assert type(data) == dt.Array
+        size_array_name = desc.size_desc_name
 
         dtype = sdfg.arrays[data_name].dtype
 
         size_assignment_strs, new_size_strs, _ = cpp._get_realloc_dimensions(
-            size_array_name, new_size_array_name, data.shape
+            size_array_name, new_size_array_name, desc.shape
         )
 
         for size_assignment in size_assignment_strs:
diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 6d77daa219..abebe1119d 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1582,7 +1582,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
 
             if aname in sdfg.arrays:
                 arr = sdfg.arrays[aname]
-                if arr.transient and arr.storage == dtypes.StorageType.GPU_Global and arr.size_desc_name is not None:
+                if arr.transient and arr.storage == dtypes.StorageType.GPU_Global and type(arr) == dt.Array and arr.size_desc_name is not None:
                     size_arr_name = data_desc.size_desc_name
                     if size_arr_name is not None:
                         size_arr = sdfg.arrays[size_arr_name]
@@ -2795,13 +2795,14 @@ def reallocate(
 
         data_name = dst_node.data
         new_size_array_name = src_node.data
 
-        data = sdfg.arrays[data_name]
-        size_array_name = data.size_desc_name
+        desc = sdfg.arrays[data_name]
+        assert type(desc) == dt.Array
+        size_array_name = desc.size_desc_name
 
         dtype = sdfg.arrays[data_name].dtype
 
         size_assignment_strs, new_size_strs, old_size_strs = cpp._get_realloc_dimensions(
-            size_array_name, new_size_array_name, data.shape
+            size_array_name, new_size_array_name, desc.shape
         )
 
@@ -2814,23 +2815,23 @@ def reallocate(
         tmp_storage_name = "__tmp_realloc_move_storage"
 
         callsite_stream.write(f"if ({dst_node.data} == nullptr) {{", cfg, state_id, dst_node.guid)
-        if data.storage == dtypes.StorageType.GPU_Global:
-            self._alloc_gpu_global(dst_node, data, callsite_stream, data_name, new_size_str)
+        if desc.storage == dtypes.StorageType.GPU_Global:
+            self._alloc_gpu_global(dst_node, desc, callsite_stream, data_name, new_size_str)
         else:
-            assert data.storage == dtypes.StorageType.CPU_Pinned
+            assert desc.storage == dtypes.StorageType.CPU_Pinned
             callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast<void **>(&{data_name}), {new_size_str}));", cfg, state_id, dst_node.guid)
         callsite_stream.write("} else {\n", cfg, state_id, dst_node.guid)
         callsite_stream.write(f"{dtype}* {tmp_storage_name};")
-        if data.storage == dtypes.StorageType.GPU_Global:
-            self._alloc_gpu_global(None, data, callsite_stream, tmp_storage_name, new_size_str)
+        if desc.storage == dtypes.StorageType.GPU_Global:
+            self._alloc_gpu_global(None, desc, callsite_stream, tmp_storage_name, new_size_str)
         else:
-            assert data.storage == dtypes.StorageType.CPU_Pinned
+            assert desc.storage == dtypes.StorageType.CPU_Pinned
             callsite_stream.write(f"DACE_GPU_CHECK({self.backend}MallocHost(reinterpret_cast<void **>(&{tmp_storage_name}), {new_size_str}));", cfg, state_id, dst_node.guid)
         s = ""
         copy_size_str = f"Min({old_size_str}, {new_size_str})"
-        if data.storage == dtypes.StorageType.GPU_Global:
-            if not data.pool: # If pooled, will be freed somewhere else
+        if desc.storage == dtypes.StorageType.GPU_Global:
+            if not desc.pool: # If pooled, will be freed somewhere else
                 s += f"DACE_GPU_CHECK({self.backend}Memcpy(static_cast<void *>({tmp_storage_name}), static_cast<void *>({data_name}), {copy_size_str}, cudaMemcpyDeviceToDevice));\n"
                 s += f"DACE_GPU_CHECK({self.backend}Free({data_name}));\n"
                 s += f"{data_name} = {tmp_storage_name};\n"
@@ -2845,7 +2846,7 @@ def reallocate(
                 self._emit_sync(callsite_stream)
                 callsite_stream.write(f"{data_name} = {tmp_storage_name};\n")
                 s = ""
-        elif data.storage == dtypes.StorageType.CPU_Pinned:
+        elif desc.storage == dtypes.StorageType.CPU_Pinned:
             s += f"DACE_GPU_CHECK({self.backend}Memcpy(static_cast<void *>({tmp_storage_name}), static_cast<void *>({data_name}), {copy_size_str}, cudaMemcpyHostToHost));\n"
             s += f"DACE_GPU_CHECK({self.backend}FreeHost({data_name}));\n"
             s += f"{data_name} = {tmp_storage_name};\n"
diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py
index 58b9dfd854..e1d4962a02 100644
--- a/dace/sdfg/sdfg.py
+++ b/dace/sdfg/sdfg.py
@@ -774,11 +774,10 @@ def replace_dict(self,
         size_ararys_to_rm = set()
         for arr_name, size_desc_name in size_desc_map.items():
             arr = self.arrays[arr_name] if arr_name in self.arrays else None
-            if arr is not None:
+            if arr is not None and type(arr) == dt.Array:
                 size_desc_name_before = arr.size_desc_name
                 # If we change the name of an array, then we need to change its size array accordingly
-                if (arr.transient and type(arr) == dt.Array and size_desc_name_before is not None
-                        and size_desc_name is not None):
+                if (arr.transient and size_desc_name_before is not None):
                     arr.size_desc_name = size_desc_name
                     assert (arr.size_desc_name == size_desc_name)
                     self.arrays[size_desc_name] = self.arrays.pop(size_desc_name_before)
@@ -1199,10 +1198,13 @@ def remove_data(self, name, validate=True):
                                  f"{name}: it is accessed by node "
                                  f"{node} in state {state}.")
 
-        size_desc_name = self._arrays[name].size_desc_name
-        # If unused it might have been removed by optimization
-        if size_desc_name is not None and size_desc_name in self._arrays:
-            del self._arrays[size_desc_name]
+        # Check for size desc
+        if type(self._arrays[name]) == dt.Array:
+            size_desc_name = self._arrays[name].size_desc_name
+            # If unused it might have been removed by optimization
+            if size_desc_name is not None and size_desc_name in self._arrays:
+                del self._arrays[size_desc_name]
+
         del self._arrays[name]
diff --git a/dace/transformation/passes/array_elimination.py b/dace/transformation/passes/array_elimination.py
index f7cf0ac64d..6a28877aa4 100644
--- a/dace/transformation/passes/array_elimination.py
+++ b/dace/transformation/passes/array_elimination.py
@@ -85,7 +85,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[S
 
         # If node is completely removed from graph, erase data descriptor
         array_items = list(sdfg.arrays.items())
-        size_descriptors = set([v.size_desc_name for v in sdfg.arrays.values() if v.size_desc_name is not None])
+        size_descriptors = set([v.size_desc_name for v in sdfg.arrays.values() if type(v) == data.Array and v.size_desc_name is not None])
         for aname, desc in array_items:
             # Remove size descriptors only if the original array is removed
             if aname in size_descriptors:

From da7ba8da1b2aa7f5d7ea9f34b1d1e026bb7c3b2e Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Fri, 13 Dec 2024 17:42:41 +0100
Subject: [PATCH 44/51] Fix to erroneous assertion

---
 dace/codegen/targets/cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py
index 2d1c33df44..68c85966b7 100644
--- a/dace/codegen/targets/cpu.py
+++ b/dace/codegen/targets/cpu.py
@@ -699,7 +699,7 @@ def reallocate(
         new_size_array_name = src_node.data
 
         desc = sdfg.arrays[data_name]
-        assert type(data) == dt.Array
+        assert type(desc) == data.Array
         size_array_name = desc.size_desc_name
 
         dtype = sdfg.arrays[data_name].dtype
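Note that these patches consistently use the strict type(x) == data.Array comparison rather than isinstance: size descriptors should attach only to plain Array descriptors, not to Array subclasses (presumably view-like types). A small illustration of the difference, using a made-up subclass as a stand-in:

import dace
from dace import data

class SpecializedArray(data.Array):
    """Hypothetical subclass standing in for view-like Array descriptors."""

plain = data.Array(dace.float32, (10,))
special = SpecializedArray(dace.float32, (10,))

# isinstance accepts both; the strict type() check accepts only the base class.
assert isinstance(plain, data.Array) and isinstance(special, data.Array)
assert type(plain) == data.Array
assert type(special) != data.Array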
From 460b75b064fcb0aa14a4414b4eb69f495a61a529 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Fri, 13 Dec 2024 17:49:58 +0100
Subject: [PATCH 45/51] Test script refactor

---
 tests/deferred_alloc_test.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py
index 73ddb7d40a..c1e6f7a2da 100644
--- a/tests/deferred_alloc_test.py
+++ b/tests/deferred_alloc_test.py
@@ -455,12 +455,20 @@ def test_incomplete_write_dimensions_2():
     print(f"Trivial Realloc with storage, cpu pinned, on non-transient data")
     test_trivial_realloc_cpu_pinned(False)
 
-    print(f"Trivial Realloc-Use with storage, cpu")
+    print(f"Trivial Realloc-Use with storage, cpu, transient")
     test_realloc_use_cpu(True)
-    print(f"Trivial Realloc-Use with storage, gpu")
+    print(f"Trivial Realloc-Use with storage, gpu, transient")
     test_realloc_use_gpu(True)
-    print(f"Trivial Realloc-Use with storage, cpu pinned")
+    print(f"Trivial Realloc-Use with storage, cpu pinned, transient")
     test_realloc_use_cpu_pinned(True)
+
+    print(f"Trivial Realloc-Use with storage, cpu, non-transient")
+    test_realloc_use_cpu(False)
+    print(f"Trivial Realloc-Use with storage, gpu, non-transient")
+    test_realloc_use_gpu(False)
+    print(f"Trivial Realloc-Use with storage, cpu pinned, non-transient")
+    test_realloc_use_cpu_pinned(False)
+
     print(f"Realloc with incomplete write one, validation")
     test_incomplete_write_dimensions_1()
     print(f"Realloc with incomplete write two, validation")

From e0472dc14f930de33a07ae6f8defc3fac120ebc4 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Mon, 16 Dec 2024 11:14:46 +0100
Subject: [PATCH 46/51] Merge fix

---
 dace/codegen/targets/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index abebe1119d..236621b252 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -1636,7 +1636,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: StateSub
 
         self._localcode.write(
             '__global__ void %s %s(%s) {\n' %
-            (launch_bounds, kernel_name, ', '.join(kernel_args_typed + dyn_args_typed + extra_kernel_args_typed)), sdfg, state_id, node)
+            (launch_bounds, kernel_name, ', '.join(kernel_args_typed + dyn_args_typed + extra_kernel_args_typed)), cfg, state_id, node)
 
         # Write constant expressions in GPU code
         self._frame.generate_constants(sdfg, self._localcode)

From 92717e1c111e76e7fc218ede6d686e194acaa06b Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Mon, 16 Dec 2024 20:42:16 +0100
Subject: [PATCH 47/51] Allocate array fix

---
 dace/codegen/targets/cuda.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index 236621b252..db902f216b 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -612,7 +612,9 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
         deferred_allocation = any([s for s in nodedesc.shape if "__dace_defer" in str(s)])
 
         # Different types of GPU arrays
-        if nodedesc.storage in dtypes.REALLOCATABLE_STORAGES:
+        if deferred_allocation:
+            assert nodedesc.storage in dtypes.REALLOCATABLE_STORAGES
+
             if not declared:
                 declaration_stream.write('%s %s;\n' % (ctypedef, dataname))
             self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)
@@ -622,8 +624,11 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
                     "%s = nullptr; // Deferred Allocation" % (dataname,)
                 )
-            else:
-                self._alloc_gpu_global(node, nodedesc, result_alloc, dataname, arrsize_malloc)
+
+        elif nodedesc.storage == dtypes.StorageType.GPU_Global:
+            if not declared:
+                declaration_stream.write('%s %s;\n' % (ctypedef, dataname))
+            self._alloc_gpu_global(node, nodedesc, result_alloc, dataname, arrsize_malloc)
 
         if node.setzero:
             if deferred_allocation:

From 592336bf9298809957b4b43353c531452abcd46c Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Mon, 16 Dec 2024 20:50:57 +0100
Subject: [PATCH 48/51] Add forgotten defined-var registration

---
 dace/codegen/targets/cuda.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py
index db902f216b..60f5ccb64f 100644
--- a/dace/codegen/targets/cuda.py
+++ b/dace/codegen/targets/cuda.py
@@ -629,6 +629,8 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV
             if not declared:
                 declaration_stream.write('%s %s;\n' % (ctypedef, dataname))
             self._alloc_gpu_global(node, nodedesc, result_alloc, dataname, arrsize_malloc)
+            self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)
+
 
         if node.setzero:
             if deferred_allocation:
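Condensing patches 46-48, the CUDA allocate_array path now distinguishes deferred arrays (emitted as null pointers and allocated on first reallocation) from ordinary GPU_Global arrays, and registers both as defined pointer variables. The sketch below is a simplified restatement, not the actual method; the parameter names mirror locals of CUDACodeGen.allocate_array and the streams are stand-ins:

from dace import dtypes
from dace.codegen.dispatcher import DefinedType

def emit_gpu_allocation(nodedesc, dataname, ctypedef, declared,
                        dispatcher, declaration_stream, result_alloc,
                        alloc_gpu_global, arrsize_malloc):
    # Deferred arrays carry "__dace_defer" in their shape.
    deferred = any('__dace_defer' in str(s) for s in nodedesc.shape)
    if deferred:
        # Patch 47: deferred arrays start as null pointers and must live in
        # a reallocatable storage; the realloc path allocates them later.
        assert nodedesc.storage in dtypes.REALLOCATABLE_STORAGES
        if not declared:
            declaration_stream.write('%s %s;\n' % (ctypedef, dataname))
        dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)
        result_alloc.write('%s = nullptr; // Deferred Allocation\n' % dataname)
    elif nodedesc.storage == dtypes.StorageType.GPU_Global:
        if not declared:
            declaration_stream.write('%s %s;\n' % (ctypedef, dataname))
        alloc_gpu_global(nodedesc, result_alloc, dataname, arrsize_malloc)
        # Patch 48: without this registration, later codegen would not find
        # the array as a defined pointer.
        dispatcher.defined_vars.add(dataname, DefinedType.Pointer, ctypedef)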
From 02937e3651086bad7733b270701c6d6b38c581d9 Mon Sep 17 00:00:00 2001
From: Yakup Budanaz
Date: Thu, 19 Dec 2024 22:12:08 +0100
Subject: [PATCH 49/51] Make size array alloc C99 std compliant instead of
 C++11

---
 dace/codegen/targets/framecode.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py
index f389a0a026..4d56883c58 100644
--- a/dace/codegen/targets/framecode.py
+++ b/dace/codegen/targets/framecode.py
@@ -982,7 +982,7 @@ def generate_code(self,
                     dimensions = ["0" if "__dace_defer" in cpp.sym2cpp(dim) else cpp.sym2cpp(dim) for dim in array.shape]
                     size_str = ",".join(dimensions)
                     assert len(size_nodedesc.shape) == 1
-                    alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n'
+                    alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}] = {{{size_str}}};\n'
                     callsite_stream.write(alloc_str)
                     self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef, allow_shadowing=True)
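The change above replaces C++11 brace-initialization (uint64_t __A_size[2]{0,10};) with the = {...} form, which is valid C99 aggregate initialization as well. A quick way to reproduce the emitted string, mirroring the f-string in generate_code with made-up values:

ctypedef, size_desc_name = 'uint64_t', '__A_size'
# Deferred dimensions materialize as 0 until the first reallocation.
dimensions = ['0' if '__dace_defer' in str(dim) else str(dim)
              for dim in ('__dace_defer', 10)]
size_str = ','.join(dimensions)
alloc_str = f'{ctypedef} {size_desc_name}[{len(dimensions)}] = {{{size_str}}};\n'
print(alloc_str)  # uint64_t __A_size[2] = {0,10};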
""" self.shape = new_shape - self._set_shape_dependent_properties(new_shape, strides, total_size, offset) + self._set_shape_dependent_properties(new_shape, strides, total_size, offset, sdfg) self.validate() + @property + def shape(self): + return self._shape + + @shape.setter + def shape(self, value): + self._shape = value + + @make_properties class Stream(Data): From 9755810a3d4f6a23cbc563902144056ff4a78d1c Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Sat, 21 Dec 2024 15:48:53 +0100 Subject: [PATCH 51/51] Rm getters, move funcitonality to set shape only --- dace/data.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/dace/data.py b/dace/data.py index 1d62c45f0b..4082b97fd4 100644 --- a/dace/data.py +++ b/dace/data.py @@ -354,13 +354,6 @@ def add(X: dace.float32[10, 10] @ dace.StorageType.GPU_Global): new_desc.storage = storage return new_desc - @property - def shape(self): - return self._shape - - @shape.setter - def shape(self, value): - self._shape = value def _arrays_to_json(arrays): if arrays is None: @@ -1590,7 +1583,7 @@ def used_symbols(self, all_symbols: bool) -> Set[symbolic.SymbolicType]: def free_symbols(self): return self.used_symbols(all_symbols=True) - def _set_shape_dependent_properties(self, shape, strides, total_size, offset, sdfg): + def _set_shape_dependent_properties(self, shape, strides, total_size, offset, sdfg=None): """ Used to set properties which depend on the shape of the array either to their default value, which depends on the shape, or @@ -1616,8 +1609,7 @@ def _set_shape_dependent_properties(self, shape, strides, total_size, offset, sd self.offset = [0] * len(shape) if self.is_deferred_array and sdfg is not None: - size_desc = sdfg.arrays[self.size_desc_name] - size_desc.shape = (len(shape), ) + sdfg.arrays[self.size_desc_name].set_shape(new_shape=(len(shape),)) def set_shape( self, @@ -1631,18 +1623,15 @@ def set_shape( Updates the shape of an array. """ self.shape = new_shape - self._set_shape_dependent_properties(new_shape, strides, total_size, offset, sdfg) + self._set_shape_dependent_properties( + shape=new_shape, + strides=strides, + total_size=total_size, + offset=offset, + sdfg=sdfg + ) self.validate() - @property - def shape(self): - return self._shape - - @shape.setter - def shape(self, value): - self._shape = value - - @make_properties class Stream(Data):