From edee4e6386551672450cd7f337b8f5d43ebdbb34 Mon Sep 17 00:00:00 2001 From: Phillip Allen Lane Date: Thu, 27 Jul 2023 10:04:11 -0700 Subject: [PATCH 1/3] add wmma::fragment to defined vars --- samples/codegen/tensor_cores.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/samples/codegen/tensor_cores.py b/samples/codegen/tensor_cores.py index 52d906254b..d799d33002 100644 --- a/samples/codegen/tensor_cores.py +++ b/samples/codegen/tensor_cores.py @@ -27,6 +27,7 @@ from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import StateSubgraphView from dace.codegen.prettycode import CodeIOStream +from dace.codegen.dispatcher import DefinedType from typing import Any, List # Other imports @@ -85,14 +86,14 @@ def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, # Write a fragment based on the storage type if nodedesc.storage == dace.StorageType.TensorCore_Accumulator: - declaration_stream.write('wmma::fragment {};'.format(name), sdfg, state_id, node) + ctype = 'wmma::fragment' + declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) else: - declaration_stream.write( - 'wmma::fragment ' - '{name};'.format(mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj, name=name), sdfg, - state_id, node) + ctype = 'wmma::fragment'.format( + mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) + declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) + + self._dispatcher.defined_vars.add(name, DefinedType.Stream, ctype) def deallocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, callsite_stream: CodeIOStream): From eb3f039102821aaf2163ca0633fc7d8dfa486d1c Mon Sep 17 00:00:00 2001 From: Phillip Allen Lane Date: Thu, 27 Jul 2023 10:11:03 -0700 Subject: [PATCH 2/3] change frontend functions from using @replaces to utilizing dae.tasklet --- samples/codegen/tensor_cores.py | 70 ++++++++++++--------------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/samples/codegen/tensor_cores.py b/samples/codegen/tensor_cores.py index d799d33002..2aa8d1f445 100644 --- a/samples/codegen/tensor_cores.py +++ b/samples/codegen/tensor_cores.py @@ -77,6 +77,9 @@ def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: dace.SDFG): def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Array, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + # Make sure the codegen includes the appropriate header files + _include_mma(sdfg) + name = node.data # Based on the hardware, the total size must be 16^2 @@ -188,50 +191,29 @@ def _include_mma(sdfg: dace.SDFG): sdfg.append_global_code(global_code, 'cuda') -@replaces('frag_fill') -def frag_fill(pv: ProgramVisitor, sdfg: dace.SDFG, state: dace.SDFGState, frag: str, fill: Any) -> List[str]: - # Replacement functions receive the SDFG and the current state as the first - # two arguments, followed by all the other arguments. Here we treat them as - # two strings representing the array name to fill and what to fill it with. - - # NOTE: If a slice is used in the `frag` argument, the Python frontend - # automatically creates a new array for it, and uses the correct string as - # the argument. - wnode = state.add_write(frag) - tasklet = state.add_tasklet('fill', - set(), {'out'}, - ''' - wmma::fill_fragment(out, %s);''' % fill, - language=dace.Language.CPP) - - state.add_edge(tasklet, 'out', wnode, None, dace.Memlet.from_array(frag, wnode.desc(sdfg))) - - _include_mma(sdfg) - - # Function has no return value - return [] - - -@replaces('wmma') -def wmma(pv: ProgramVisitor, sdfg: dace.SDFG, state: dace.SDFGState, a_frag: str, b_frag: str, - c_frag: str) -> List[str]: - # Implemented similarly to `frag_fill`, but with inputs and outputs. - anode = state.add_read(a_frag) - bnode = state.add_read(b_frag) - cnode = state.add_write(c_frag) - tasklet = state.add_tasklet('wmma', {'afrag', 'bfrag'}, {'cfrag'}, - ''' - wmma::mma_sync(cfrag, afrag, bfrag, cfrag);''', - language=dace.Language.CPP) - - state.add_edge(anode, None, tasklet, 'afrag', dace.Memlet.from_array(a_frag, anode.desc(sdfg))) - state.add_edge(bnode, None, tasklet, 'bfrag', dace.Memlet.from_array(b_frag, bnode.desc(sdfg))) - state.add_edge(tasklet, 'cfrag', cnode, None, dace.Memlet.from_array(c_frag, cnode.desc(sdfg))) - - _include_mma(sdfg) - - # Function has no return value - return [] +def frag_fill(frag, fill): + # Define a tasklet with the appropriate input and output connectors. + # Then we can directly emit CUDA for the tasklet. + with dace.tasklet(dace.Language.CPP): + val << fill + out >> frag + """ + wmma::fill_fragment(out, val); + """ + +def wmma(a_frag, b_frag, c_frag): + # We do the same here as we did with frag_fill. Since c_frag is used + # as both an input and an output, we specify two separate variables + # to be passed to mma_sync and declare c_frag as an input to one and + # an output to the other. This ensures proper dataflow. + with dace.tasklet(dace.Language.CPP): + afrag << a_frag + bfrag << b_frag + cfrag << c_frag + dfrag >> c_frag + """ + wmma::mma_sync(dfrag, afrag, bfrag, cfrag); + """ ############################################################################ From ed6572480b1c2bea419f7288088d30a65e0c17ae Mon Sep 17 00:00:00 2001 From: Phillip Allen Lane Date: Thu, 27 Jul 2023 10:21:36 -0700 Subject: [PATCH 3/3] add comment explaining the use of defined_vars --- samples/codegen/tensor_cores.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/samples/codegen/tensor_cores.py b/samples/codegen/tensor_cores.py index 2aa8d1f445..92ea28eacf 100644 --- a/samples/codegen/tensor_cores.py +++ b/samples/codegen/tensor_cores.py @@ -96,6 +96,8 @@ def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, mat=('a' if 'A' in nodedesc.storage.name else 'b'), maj=maj) declaration_stream.write(f'{ctype} {name};', sdfg, state_id, node) + # Add the ctype to defined_vars so that the codegen can properly pass + # fragments to functions as an object reference. self._dispatcher.defined_vars.add(name, DefinedType.Stream, ctype) def deallocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode,