Skip to content

Commit

Permalink
Merge branch 'spcl:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
hodelcl authored Jan 3, 2024
2 parents 1991648 + 1393cb0 commit 351ed9a
Show file tree
Hide file tree
Showing 13 changed files with 978 additions and 89 deletions.
86 changes: 38 additions & 48 deletions dace/codegen/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
cmake_minimum_required(VERSION 3.15)
cmake_minimum_required(VERSION 3.17)
project(dace_program)

# General options
Expand All @@ -9,6 +9,9 @@ set(DACE_FILES "" CACHE STRING "List of host code files relative to the root of
set(DACE_LIBS "" CACHE STRING "Extra libraries")
set(HLSLIB_PART_NAME "${DACE_XILINX_PART_NAME}")

# CUDA
set(DACE_CUDA_ARCHITECTURES_DEFAULT "" CACHE STRING "Default CUDA architectures in case native not found")

# FPGA specific
set(DACE_FPGA_AUTOBUILD_BITSTREAM OFF CACHE STRING "Automatically build bitstreams if they are not present.")

Expand Down Expand Up @@ -60,7 +63,7 @@ foreach(DACE_FILE ${DACE_FILES})
set(DACE_HIP_FILES ${DACE_HIP_FILES} ${DACE_FILE})
else()
set(DACE_ENABLE_CUDA ON)
set(DACE_CUDA_FILES ${DACE_CUDA_FILES} ${DACE_FILE})
set(DACE_CPP_FILES ${DACE_CPP_FILES} ${DACE_FILE})
endif()
elseif(${DACE_FILE_TARGET} STREQUAL "xilinx")
set(DACE_ENABLE_XILINX ON)
Expand Down Expand Up @@ -103,24 +106,42 @@ include_directories(${DACE_RUNTIME_DIR}/include)
# Global DaCe external dependencies
find_package(Threads REQUIRED)
find_package(OpenMP REQUIRED COMPONENTS CXX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

list(APPEND DACE_LIBS Threads::Threads)
list(APPEND DACE_LIBS OpenMP::OpenMP_CXX)

add_definitions(-DDACE_BINARY_DIR=\"${CMAKE_BINARY_DIR}\")
set(DACE_LIBS ${DACE_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${OpenMP_CXX_LIBRARIES})

if(DACE_ENABLE_MPI)
find_package(MPI REQUIRED)
include_directories(${MPI_CXX_INCLUDE_PATH})
set(DACE_LIBS ${DACE_LIBS} ${MPI_CXX_LIBRARIES})
list(APPEND DACE_LIBS MPI::MPI_CXX)
endif()

if(DACE_ENABLE_CUDA)
find_package(CUDA REQUIRED)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
include_directories(${CUDA_INCLUDE_DIRS})
if (MSVC_IDE)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
else()
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
set(CUDAToolkit_ROOT ${CUDA_TOOLKIT_ROOT_DIR})

find_package(CUDAToolkit REQUIRED)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

# Detect the CUDA architectures of the GPUs on this machine by compiling and
# running tools/get_cuda_arch.cpp through nvcc. The probe only runs when
# LOCAL_CUDA_ARCHITECTURES is not already defined; its result is stored in the
# cache. If the probe fails, DACE_CUDA_ARCHITECTURES_DEFAULT is used instead.
# Note: from CMake 3.24 onwards this probe could be replaced by
#   set_property(TARGET tgt PROPERTY CUDA_ARCHITECTURES native)
if (NOT DEFINED LOCAL_CUDA_ARCHITECTURES)
  execute_process(COMMAND "${CUDAToolkit_NVCC_EXECUTABLE}" "--run"
                  "${CMAKE_SOURCE_DIR}/tools/get_cuda_arch.cpp"
                  OUTPUT_VARIABLE _local_arch RESULT_VARIABLE _arch_res
                  OUTPUT_STRIP_TRAILING_WHITESPACE)

  if(_arch_res EQUAL 0)
    # Keep only the last line of the captured output, so that anything printed
    # before the architecture list does not end up in the list. The list is
    # ';'-separated, so the line is extracted by position rather than by
    # converting newlines into list separators.
    string(FIND "${_local_arch}" "\n" _last_newline REVERSE)
    if(NOT _last_newline EQUAL -1)
      math(EXPR _last_line_start "${_last_newline} + 1")
      string(SUBSTRING "${_local_arch}" ${_last_line_start} -1 _local_arch)
    endif()
    # Drop surrounding whitespace (e.g., a stray carriage return)
    string(STRIP "${_local_arch}" _local_arch)

    set(LOCAL_CUDA_ARCHITECTURES "${_local_arch}" CACHE STRING "Detected local GPUs for compilation")
    message(STATUS "Local CUDA architectures detected: ${LOCAL_CUDA_ARCHITECTURES}")
  else()
    set(LOCAL_CUDA_ARCHITECTURES "${DACE_CUDA_ARCHITECTURES_DEFAULT}" CACHE STRING "Detected local GPUs for compilation")
    message(STATUS "No local CUDA-capable GPUs found. Using default: ${DACE_CUDA_ARCHITECTURES_DEFAULT}")
  endif()
endif()
set(DACE_LIBS ${DACE_LIBS} ${CUDA_LIBRARIES})

set(CMAKE_CUDA_ARCHITECTURES "${LOCAL_CUDA_ARCHITECTURES}")
enable_language(CUDA)
list(APPEND DACE_LIBS CUDA::cudart)
add_definitions(-DWITH_CUDA)

if (MSVC_IDE)
Expand Down Expand Up @@ -242,38 +263,6 @@ if (DACE_ENABLE_RTL AND DACE_ENABLE_XILINX)
include ("${DACE_RTLLIB_DIR}/cmake/rtl_target.cmake")
endif()

# Create CUDA object files
if(DACE_ENABLE_CUDA)
# Get local CUDA architectures
if (NOT DEFINED LOCAL_CUDA_ARCHITECTURES)
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin" "${CMAKE_CXX_COMPILER}" "--run"
"${CMAKE_SOURCE_DIR}/tools/get_cuda_arch.cpp"
OUTPUT_VARIABLE _arch_out RESULT_VARIABLE _arch_res
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

if(_arch_res EQUAL 0)
string(REGEX REPLACE "\n" ";" _arch_out "${_arch_out}")
list(GET _arch_out -1 _local_arch)
string(REGEX REPLACE " " ";" _local_arch "${_local_arch}")
set(LOCAL_CUDA_ARCHITECTURES "${_local_arch}" CACHE STRING "Detected local GPUs for compilation")
message(STATUS "Local CUDA architectures detected: ${LOCAL_CUDA_ARCHITECTURES}")
else()
set(LOCAL_CUDA_ARCHITECTURES "" CACHE STRING "Detected local GPUs for compilation")
message(STATUS "No local CUDA-capable GPUs found")
endif()
endif()

# Add flags to compile for local CUDA architectures
foreach(var ${LOCAL_CUDA_ARCHITECTURES})
list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${var},code=sm_${var})
endforeach()

cuda_include_directories(${DACE_RUNTIME_DIR}/include)
cuda_compile(DACE_CUDA_OBJECTS ${DACE_CUDA_FILES})
set(DACE_OBJECTS ${DACE_OBJECTS} ${DACE_CUDA_OBJECTS})
endif() # DACE_ENABLE_CUDA


# Create HIP object files
if(DACE_ENABLE_HIP)
# Get local AMD architectures
Expand Down Expand Up @@ -580,7 +569,7 @@ include("targets/mlir/mlir.cmake")

# Create DaCe library file
add_library(${DACE_PROGRAM_NAME} SHARED ${DACE_CPP_FILES} ${DACE_OBJECTS})
target_link_libraries(${DACE_PROGRAM_NAME} ${DACE_LIBS})
target_link_libraries(${DACE_PROGRAM_NAME} PUBLIC ${DACE_LIBS})

# Add additional required files
if(DACE_ENABLE_INTELFPGA)
Expand All @@ -599,6 +588,7 @@ if(DACE_ENABLE_INTELFPGA)
DEPENDS ${DACE_PROGRAM_NAME}_hardware.aocx)
endif()
endif()

if(DACE_ENABLE_XILINX)
if(DACE_XILINX_MODE STREQUAL "software_emulation" AND DACE_FPGA_AUTOBUILD_BITSTREAM)
add_custom_target(autobuild_bitstream ALL
Expand All @@ -619,7 +609,7 @@ endif()

# Create DaCe loader stub
add_library(dacestub_${DACE_PROGRAM_NAME} SHARED "${CMAKE_SOURCE_DIR}/tools/dacestub.cpp")
target_link_libraries(dacestub_${DACE_PROGRAM_NAME} ${CMAKE_THREAD_LIBS_INIT} ${OpenMP_CXX_LIBRARIES})
target_link_libraries(dacestub_${DACE_PROGRAM_NAME} Threads::Threads OpenMP::OpenMP_CXX ${CMAKE_DL_LIBS})

# Windows-specific fixes
if (MSVC_IDE)
Expand Down
3 changes: 3 additions & 0 deletions dace/codegen/targets/cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1256,6 +1256,9 @@ def visit_Name(self, node: ast.Name):
except KeyError:
defined_type = None
if (self.allow_casts and isinstance(dtype, dtypes.pointer) and memlet.subset.num_elements() == 1):
# Special case for pointer to pointer assignment
if memlet.data in self.sdfg.arrays and self.sdfg.arrays[memlet.data].dtype == dtype:
return self.generic_visit(node)
return ast.parse(f"{name}[0]").body[0].value
elif (self.allow_casts and (defined_type in (DefinedType.Stream, DefinedType.StreamArray))
and memlet.dynamic):
Expand Down
39 changes: 25 additions & 14 deletions dace/codegen/targets/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,10 +717,8 @@ def _emit_copy(

state_dfg = sdfg.nodes()[state_id]

copy_shape, src_strides, dst_strides, src_expr, dst_expr = \
cpp.memlet_copy_to_absolute_strides(
self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node,
self._packed_types)
copy_shape, src_strides, dst_strides, src_expr, dst_expr = cpp.memlet_copy_to_absolute_strides(
self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._packed_types)

# Which numbers to include in the variable argument part
dynshape, dynsrc, dyndst = 1, 1, 1
Expand Down Expand Up @@ -904,7 +902,8 @@ def process_out_memlets(self,
_, uconn, v, _, memlet = edge
if skip_wcr and memlet.wcr is not None:
continue
dst_node = dfg.memlet_path(edge)[-1].dst
dst_edge = dfg.memlet_path(edge)[-1]
dst_node = dst_edge.dst

# Target is neither a data nor a tasklet node
if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode)
Expand Down Expand Up @@ -952,9 +951,12 @@ def process_out_memlets(self,

conntype = node.out_connectors[uconn]
is_scalar = not isinstance(conntype, dtypes.pointer)
if isinstance(conntype, dtypes.pointer) and sdfg.arrays[memlet.data].dtype == conntype:
is_scalar = True # Pointer to pointer assignment
is_stream = isinstance(sdfg.arrays[memlet.data], data.Stream)
is_refset = isinstance(sdfg.arrays[memlet.data], data.Reference) and dst_edge.dst_conn == 'set'

if is_scalar and not memlet.dynamic and not is_stream:
if (is_scalar and not memlet.dynamic and not is_stream) or is_refset:
out_local_name = " __" + uconn
in_local_name = uconn
if not locals_defined:
Expand Down Expand Up @@ -987,6 +989,9 @@ def process_out_memlets(self,
if defined_type == DefinedType.Scalar:
mname = cpp.ptr(memlet.data, desc, sdfg, self._frame)
write_expr = f"{mname} = {in_local_name};"
elif defined_type == DefinedType.Pointer and is_refset:
mname = cpp.ptr(memlet.data, desc, sdfg, self._frame)
write_expr = f"{mname} = {in_local_name};"
elif (defined_type == DefinedType.ArrayInterface and not isinstance(desc, data.View)):
# Special case: No need to write anything between
# array interfaces going out
Expand Down Expand Up @@ -1473,15 +1478,21 @@ def define_out_memlet(self, sdfg, state_dfg, state_id, src_node, dst_node, edge,
cdtype = src_node.out_connectors[edge.src_conn]
if isinstance(sdfg.arrays[edge.data.data], data.Stream):
pass
elif isinstance(cdtype, dtypes.pointer):
# If pointer, also point to output
elif isinstance(cdtype, dtypes.pointer): # If pointer, also point to output
desc = sdfg.arrays[edge.data.data]
ptrname = cpp.ptr(edge.data.data, desc, sdfg, self._frame)
is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent,
dtypes.AllocationLifetime.External)
defined_type, _ = self._dispatcher.defined_vars.get(ptrname, is_global=is_global)
base_ptr = cpp.cpp_ptr_expr(sdfg, edge.data, defined_type, codegen=self._frame)
callsite_stream.write(f'{cdtype.ctype} {edge.src_conn} = {base_ptr};', sdfg, state_id, src_node)

# If reference set, do not emit initial assignment
is_refset = isinstance(desc, data.Reference) and state_dfg.memlet_path(edge)[-1].dst_conn == 'set'

if not is_refset and not isinstance(desc.dtype, dtypes.pointer):
ptrname = cpp.ptr(edge.data.data, desc, sdfg, self._frame)
is_global = desc.lifetime in (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent,
dtypes.AllocationLifetime.External)
defined_type, _ = self._dispatcher.defined_vars.get(ptrname, is_global=is_global)
base_ptr = cpp.cpp_ptr_expr(sdfg, edge.data, defined_type, codegen=self._frame)
callsite_stream.write(f'{cdtype.ctype} {edge.src_conn} = {base_ptr};', sdfg, state_id, src_node)
else:
callsite_stream.write(f'{cdtype.as_arg(edge.src_conn)};', sdfg, state_id, src_node)
else:
callsite_stream.write(f'{cdtype.ctype} {edge.src_conn};', sdfg, state_id, src_node)

Expand Down
8 changes: 4 additions & 4 deletions dace/codegen/targets/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,11 +484,11 @@ def cmake_options():
cuda_arch = Config.get('compiler', 'cuda', 'cuda_arch').split(',')
cuda_arch = [ca for ca in cuda_arch if ca is not None and len(ca) > 0]

flags = Config.get("compiler", "cuda", "args")
flags += ' ' + ' '.join('-gencode arch=compute_{arch},code=sm_{arch}'.format(arch=arch)
for arch in cuda_arch)
cuda_arch = ';'.join(cuda_arch)
options.append(f'-DDACE_CUDA_ARCHITECTURES_DEFAULT="{cuda_arch}"')

options.append("-DCUDA_NVCC_FLAGS=\"{}\"".format(flags))
flags = Config.get("compiler", "cuda", "args")
options.append("-DCMAKE_CUDA_FLAGS=\"{}\"".format(flags))

if backend == 'hip':
hip_arch = Config.get('compiler', 'cuda', 'hip_arch').split(',')
Expand Down
15 changes: 10 additions & 5 deletions dace/codegen/tools/get_cuda_arch.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
// Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved.
#include <cuda_runtime.h>

#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <string>

int main(int argc, char **argv) {
int main() {
int count;
if (cudaGetDeviceCount(&count) != cudaSuccess) return 1;

Expand All @@ -22,10 +24,13 @@ int main(int argc, char **argv) {
architectures.insert(ss.str());
}

// Print out architectures
for (std::set<std::string>::iterator iter = architectures.begin();
iter != architectures.end(); ++iter)
std::cout << *iter << " ";
if (architectures.empty()) {
return 1;
}

std::copy(architectures.begin(), std::prev(architectures.end(), 1),
std::ostream_iterator<std::string>(std::cout, ";"));
std::cout << *architectures.rbegin();

return 0;
}
4 changes: 2 additions & 2 deletions dace/config_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,8 @@ required:
type: str
title: nvcc Arguments
description: Compiler argument flags for CUDA
default: '-std=c++14 -Xcompiler -fPIC -O3 -Xcompiler -march=native --use_fast_math -Xcompiler -Wno-unused-parameter'
default_Windows: '-std=c++14 -O3 --use_fast_math'
default: '-Xcompiler -march=native --use_fast_math -Xcompiler -Wno-unused-parameter'
default_Windows: '-O3 --use_fast_math'

hip_args:
type: str
Expand Down
8 changes: 7 additions & 1 deletion dace/sdfg/analysis/schedule_tree/sdfg_to_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,9 +424,15 @@ def prepare_schedule_tree_edges(state: SDFGState) -> Dict[gr.MultiConnectorEdge[
# 2. Check for reference sets
if isinstance(e.dst, dace.nodes.AccessNode) and e.dst_conn == 'set':
assert isinstance(e.dst.desc(sdfg), dace.data.Reference)

# Determine source
if isinstance(mtree.root().edge.src, dace.nodes.CodeNode):
src_desc = mtree.root().edge.src
else:
src_desc = sdfg.arrays[e.data.data]
result[e] = tn.RefSetNode(target=e.dst.data,
memlet=e.data,
src_desc=sdfg.arrays[e.data.data],
src_desc=src_desc,
ref_desc=sdfg.arrays[e.dst.data])
scope = state.entry_node(e.dst if mtree.downwards else e.src)
scope_to_edges[scope].append(e)
Expand Down
7 changes: 4 additions & 3 deletions dace/sdfg/analysis/schedule_tree/treenodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ class ScheduleTreeScope(ScheduleTreeNode):
containers: Optional[Dict[str, data.Data]] = field(default_factory=dict, init=False)
symbols: Optional[Dict[str, symbol]] = field(default_factory=dict, init=False)

def __init__(self,
children: Optional[List['ScheduleTreeNode']] = None):
def __init__(self, children: Optional[List['ScheduleTreeNode']] = None):
self.children = children or []
if self.children:
for child in children:
Expand Down Expand Up @@ -350,10 +349,12 @@ class RefSetNode(ScheduleTreeNode):
"""
target: str
memlet: Memlet
src_desc: data.Data
src_desc: Union[data.Data, nodes.CodeNode]
ref_desc: data.Data

def as_string(self, indent: int = 0):
    """
    Formats this reference-set node as a single line of text.

    :param indent: Number of times ``INDENTATION`` is repeated in front of the line.
    :return: The line ``<target> = refset to <memlet>``, or, if ``src_desc`` is a code node,
             ``<target> = refset from <lowercase class name of the code node>``.
    """
    prefix = indent * INDENTATION
    if not isinstance(self.src_desc, nodes.CodeNode):
        return prefix + f'{self.target} = refset to {self.memlet}'
    # The source is a code node: report its kind rather than the memlet
    return prefix + f'{self.target} = refset from {type(self.src_desc).__name__.lower()}'


Expand Down
21 changes: 14 additions & 7 deletions dace/sdfg/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def validate_control_flow_region(sdfg: 'dace.sdfg.SDFG',
in_default_scope = False
if sdfg.parent_nsdfg_node is not None:
if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node,
[dtypes.ScheduleType.Default]):
[dtypes.ScheduleType.Default]):
in_default_scope = True
if in_default_scope is False:
eid = region.edge_id(edge)
Expand All @@ -126,9 +126,9 @@ def validate_control_flow_region(sdfg: 'dace.sdfg.SDFG',
if start_block not in visited:
if isinstance(start_block, SDFGState):
validate_state(start_block, region.node_id(start_block), sdfg, symbols, initialized_transients, references,
**context)
**context)
else:
validate_control_flow_region(sdfg, start_block, initialized_transients, symbols, references, **context)
validate_control_flow_region(sdfg, start_block, initialized_transients, symbols, references, **context)

# Validate all inter-state edges (including self-loops not found by DFS)
for eid, edge in enumerate(region.edges()):
Expand Down Expand Up @@ -162,7 +162,7 @@ def validate_control_flow_region(sdfg: 'dace.sdfg.SDFG',
in_default_scope = False
if sdfg.parent_nsdfg_node is not None:
if is_in_scope(sdfg.parent_sdfg, sdfg.parent, sdfg.parent_nsdfg_node,
[dtypes.ScheduleType.Default]):
[dtypes.ScheduleType.Default]):
in_default_scope = True
if in_default_scope is False:
raise InvalidSDFGInterstateEdgeError(
Expand Down Expand Up @@ -453,9 +453,16 @@ def validate_state(state: 'dace.sdfg.SDFGState',

# Find uninitialized transients
if node.data not in initialized_transients:
if (arr.transient and state.in_degree(node) == 0 and state.out_degree(node) > 0
# Streams do not need to be initialized
and not isinstance(arr, dt.Stream)):
if isinstance(arr, dt.Reference): # References are considered more conservatively
if any(e.dst_conn == 'set' for e in state.in_edges(node)):
initialized_transients.add(node.data)
else:
raise InvalidSDFGNodeError(
'Reference data descriptor was used before it was set. Set '
'it with an incoming memlet to the "set" connector', sdfg, state_id, nid)
elif (arr.transient and state.in_degree(node) == 0 and state.out_degree(node) > 0
# Streams do not need to be initialized
and not isinstance(arr, dt.Stream)):
if node.setzero == False:
warnings.warn('WARNING: Use of uninitialized transient "%s" in state %s' %
(node.data, state.label))
Expand Down
Loading

0 comments on commit 351ed9a

Please sign in to comment.