Skip to content

Commit

Permalink
Merge pull request #65 from db-tu-dresden/mwe_rtl_fpga
Browse files Browse the repository at this point in the history
Minimal Working Example RTL on FPGA
  • Loading branch information
JPietrzykTUD authored Oct 6, 2023
2 parents 99986c5 + c568aa7 commit 9a1ead5
Show file tree
Hide file tree
Showing 10 changed files with 257 additions and 23 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ project(<PROJECTNAME>)
include(tools/tsl/tsl.cmake)
#tsl.cmake exports a function which can should be used to generate the TSL
create_tsl(
TSLGENERATOR_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/tools/tslgen"
DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/tools/tslgen"
TSLGENERATOR_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/tools/tsl"
DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/tools/tsl"
)
target_include_directories(<target> PUBLIC ${TSL_INCLUDE_DIRECTORY} <target_includes>...) #[1] see explanation below
Expand Down Expand Up @@ -300,11 +300,11 @@ The primitives and extensions reside in the namespace `tsl`. An minimal example
//Now we can access the TSL functionality through their namespace:
int main() {
//Now we can access the TSL functionality through their namespace.
auto _vec = tsl::set1<tsl::avx2, uint32_t>(42);
auto _vec = tsl::set1<tsl::simd<uint32_t, tsl::avx2>>(42);

{ // Of course, you can also use the namespace
using namespace tsl;
to_ostream<avx2, uint32_t>(std::cout, _vec) << std::endl;
to_ostream<simd<uint32_t, avx2>(std::cout, _vec) << std::endl;
// This should print the following to stdout: [42,42,42,42,42,42,42,42]
}
return 0;
Expand Down
83 changes: 83 additions & 0 deletions examples/oneAPIfpga/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# CMakeFile for building the example(s).
# The following parameters can be passed to cmake:
# BOARD: name of fpga board [intel_s10sx_pac:pac_s10_usm]

cmake_minimum_required(VERSION 3.13)
project(oneAPIfpgaExamples)


set(TSLROOT ${CMAKE_SOURCE_DIR}/../..)

set(CMAKE_VERBOSE_MAKEFILE ON)

set(project_cxx_standard 20)
set(release_cxx_flag "-O2")

set(warnings "-Wall;-Wextra;-Wpedantic")
set(release_warnings "-Winline")

if (DEFINED BOARD)
set(fpga_board ${BOARD})
else()
set(fpga_board "intel_s10sx_pac:pac_s10_usm")
endif()

set(fpga_link_options -qactypes -fsycl -fintelfpga)

# generate the TSL (with SSE, AVX, AVX512, ONEAPIfpga) assuming Intel Xeon Gold Cascade Lake (e.g., Xeon Gold 6238R)
include(${TSLROOT}/tsl.cmake)
create_tsl(
TSLGENERATOR_DIRECTORY "${TSLROOT}"
DESTINATION "${CMAKE_BINARY_DIR}/tsl"
TARGETS_FLAGS "sse;sse2;ssse3;sse4_1;sse4_2;avx;avx2;avx512f;avx512dq;avx512cd;avx512bw;avx512vl;avx512_vnni;bmi1;bmi2;oneAPIfpgaDev"
USE_CONCEPTS
LINK_OPTIONS ${fpga_link_options}
)
#create_tsl(
# TSLGENERATOR_DIRECTORY "${TSLROOT}"
# DESTINATION "${CMAKE_BINARY_DIR}/tsl"
# TARGETS_FLAGS "oneAPIfpgaDev"
# USE_CONCEPTS
# LINK_OPTIONS ${fpga_link_options}
#)
message(STATUS "TSL Include Directory: ${TSL_INCLUDE_DIRECTORY}")


########################################################
## Build emulator
########################################################
function(create_fpga_emulator_target targetName mainFile)
set(exec_target_name ${targetName}.fpga.emu)
add_executable(${exec_target_name} ${mainFile})
target_include_directories(${exec_target_name} PRIVATE ${TSL_INCLUDE_DIRECTORY})
target_link_libraries(${exec_target_name} tsl)
target_link_libraries(${exec_target_name} libtslOneAPIFPGA)
set_target_properties(${exec_target_name} PROPERTIES CXX_STANDARD ${project_cxx_standard})
target_compile_options(${exec_target_name} PRIVATE -fsycl ${release_cxx_flag} ${warnings} ${release_warnings} -fintelfpga -qactypes)
target_link_options(${exec_target_name} PRIVATE ${fpga_link_options})
endfunction()

########################################################
## Build hardware
########################################################
function(create_fpga_target targetName mainFile)
set(exec_target_name ${targetName}.fpga)
add_executable(${exec_target_name} ${mainFile})
target_include_directories(${exec_target_name} PRIVATE ${TSL_INCLUDE_DIRECTORY})
target_link_libraries(${exec_target_name} tsl)
target_link_libraries(${exec_target_name} libtslOneAPIFPGA)
set_target_properties(${exec_target_name} PROPERTIES CXX_STANDARD ${project_cxx_standard})
target_compile_definitions(${exec_target_name} PRIVATE ONEAPI_FPGA_HARDWARE)
target_compile_options(${exec_target_name} PRIVATE -fsycl ${release_cxx_flag} ${warnings} ${release_warnings} -fintelfpga -Xsoutput-report-folder=${targetName}.prj -qactypes)
target_link_options(${exec_target_name} PRIVATE -qactypes -fsycl -fintelfpga -Xshardware -Xsboard=${fpga_board} -reuse-exe=${CMAKE_CURRENT_BINARY_DIR}/${exec_target_name})
endfunction()

if(NOT DEFINED ${TARGET})
message(STATUS "No target specified. Assuming emulator")
set(TARGET EMULATOR)
endif()
if(${TARGET} STREQUAL "EMULATOR")
create_fpga_emulator_target(clz_rtl clz_rtl_example.cpp)
elseif(${TARGET} STREQUAL "FPGA_HARDWARE")
create_fpga_target(clz_rtl clz_rtl_example.cpp)
endif()
18 changes: 18 additions & 0 deletions examples/oneAPIfpga/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#check if argument passed
if [ $# -eq 0 ]; then
echo "No target specified. Using emulator"
TARGET=EMULATOR
else
#check if argument is either emu or hw
if [ $1 != "emu" ] && [ $1 != "hw" ]; then
echo "Invalid target (emu|hw) specified. Using emulator"
TARGET=EMULATOR
elif [ $1 == "emu" ]; then
TARGET=EMULATOR
else
TARGET=FPGA_HARDWARE
fi
fi
CC=icx CXX=icpx cmake -B build -S . -DCMAKE_BUILD_TYPE=Release -DTARGET=$TARGET
cmake --build build
94 changes: 94 additions & 0 deletions examples/oneAPIfpga/clz_rtl_example.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#include <iostream>
#include "tslintrin.hpp"

template<class SimdT, typename PtrTOut, typename PtrTIn, typename SizeT>
struct count_leading_zero_kernel {
static void apply(PtrTOut out, PtrTIn in, SizeT element_count) {
for (size_t i = 0; i < element_count; i+=SimdT::vector_element_count()) {
auto in_reg = tsl::loadu<SimdT>(&in[i]);
auto result_reg = tsl::lzc<SimdT>(in_reg);
tsl::storeu<SimdT>(&out[i], result_reg);
}
}
};

int main(void) {
// so far, only 32-bit unsigned integers are supported as RTL code

using namespace tsl;
executor<runtime::cpu> cpu_executor;
using cpu_simd = simd<uint32_t, avx512>;

executor<runtime::oneAPI_default_fpga> fpga_executor{
sycl::property_list{sycl::property::queue::enable_profiling()}
};
using fpga_simd = simd<uint32_t, oneAPIfpgaRTL, 512>;

// allocate memory on host
auto host_mem_data = cpu_executor.allocate<uint32_t>(128);
auto host_mem_result = cpu_executor.allocate<uint32_t>(128);
// allocate memory accessible from host and FPGA device
// WATCH OUT: oneAPI::MEMORY_ON_HOST and oneAPI::MEMORY_ON_DEVICE will soon be moved up in the namespace hierarchy
auto usm_host_mem_data = fpga_executor.allocate<uint32_t>(128, oneAPI::MEMORY_ON_HOST{});
auto usm_host_mem_result = fpga_executor.allocate<uint32_t>(128, oneAPI::MEMORY_ON_HOST{});
// allocate memory on FPGA device
auto usm_dev_mem_data = fpga_executor.allocate<uint32_t>(128, oneAPI::MEMORY_ON_DEVICE{});
auto usm_dev_mem_result = fpga_executor.allocate<uint32_t>(128, oneAPI::MEMORY_ON_DEVICE{});

// initialize input data
for (size_t i = 0; i < 128; i++) {
host_mem_data[i] = i;
usm_host_mem_data[i] = i;
}
// copy input data to FPGA device
fpga_executor.copy(usm_dev_mem_data, usm_host_mem_data, 128);

// initialize output
for (size_t i = 0; i < 128; i++) {
host_mem_result[i] = 0;
usm_host_mem_result[i] = 0;
}
// copy output to FPGA device
fpga_executor.copy(usm_dev_mem_result, usm_host_mem_result, 128);


// run kernel on CPU using avx512
cpu_executor.submit<cpu_simd, count_leading_zero_kernel>(host_mem_result, host_mem_data, (size_t)128);

// run kernel on FPGA using oneAPIfpgaRTL (RTL code is built seperately). Use USM-Host memory
fpga_executor.submit<fpga_simd, count_leading_zero_kernel>(usm_host_mem_result, usm_host_mem_data, (size_t)128);

// check results
for (size_t i = 0; i < 128; i++) {
if (host_mem_result[i] != usm_host_mem_result[i]) {
std::cerr << "ERROR: host_mem_result[" << i << "] = " << host_mem_result[i] << " != usm_host_mem_result[" << i << "] = " << usm_host_mem_result[i] << std::endl;
std::terminate();
}
}

// run kernel on FPGA using oneAPIfpgaRTL (RTL code is built seperately). Use USM-Device memory
fpga_executor.submit<fpga_simd, count_leading_zero_kernel>(usm_dev_mem_result, usm_dev_mem_data, (size_t)128);

// copy output to host
fpga_executor.copy(usm_host_mem_result, usm_dev_mem_result, 128);

// check results
for (size_t i = 0; i < 128; i++) {
if (host_mem_result[i] != usm_host_mem_result[i]) {
std::cerr << "ERROR: host_mem_result[" << i << "] = " << host_mem_result[i] << " != usm_host_mem_result[" << i << "] = " << usm_host_mem_result[i] << std::endl;
std::terminate();
}
}

// free memory
fpga_executor.deallocate(usm_dev_mem_result);
fpga_executor.deallocate(usm_dev_mem_data);
fpga_executor.deallocate(usm_host_mem_result);
fpga_executor.deallocate(usm_host_mem_data);
cpu_executor.deallocate(host_mem_result);
cpu_executor.deallocate(host_mem_data);

// done
std::cout << "Everything worked fine!" << std::endl;
return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ add_subdirectory({{ supplementary_lib["cmakelists_path"] }})
add_dependencies({{ tsl_lib_name}} {{ supplementary_lib["name"] }})
{% endfor %}
# Add includes if present
{% for supplementary_lib in tsl_required_supplementary_libraries %}
target_include_directories({{ tsl_lib_name}} INTERFACE {{ supplementary_lib["include_path"] }})
{% endfor %}
{% for include_path in tsl_additional_include_paths %}
#target_include_directories({{ tsl_lib_name}} INTERFACE {{ include_path }}/)
target_include_directories({{ tsl_lib_name }} INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/{{ include_path }}> $<INSTALL_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/{{ include_path }} )
{% endfor %}
5 changes: 5 additions & 0 deletions generator/static_files/core/utils/runtime.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ file_description: "Static header that defines the runtime."
includes:
- "<cstdlib>"
- "<cstddef>"
- "<utility>"
implementations:
- |
template<class ExecTarget>
Expand All @@ -29,6 +30,10 @@ implementations:
target.copy(out, in, element_count);
}
public:
template<VectorProcessingStyle PS, template<typename...> class Fun, typename... Args>
decltype(auto) submit(Args... args) {
return target.template submit<PS, Fun>(args...);
}
template<template<typename...> class Fun, typename... Args>
decltype(auto) submit(Args... args) {
return target.template submit<Fun>(args...);
Expand Down
43 changes: 31 additions & 12 deletions supplementary/oneApiFPGA/one_api_lib_create.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,14 @@ set(ONE_API_FPGA_OBJECTS PARENT_SCOPE)
function(register_one_api_fpga_object)
set(options)
set(oneValueArgs NAME)
set(multiValueArgs SOURCES SPECS)
set(multiValueArgs SOURCES SPECS VFILE)
cmake_parse_arguments(REGISTER_AND_BUILD_ONE_API_FPGA_OBJECT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

message(STATUS "Inside register_one_api_fpga_object")
message(STATUS "[REGISTER_ONE_API_FPGA_OBJECT]: Inside register_one_api_fpga_object")
message(STATUS "[REGISTER_ONE_API_FPGA_OBJECT]: ${ONE_API_FPGA_CROSSGEN} -v -fPIC ${REGISTER_AND_BUILD_ONE_API_FPGA_OBJECT_SPECS} --emulation_model
${REGISTER_AND_BUILD_ONE_API_FPGA_OBJECT_SOURCES} --target sycl
-o ${REGISTER_AND_BUILD_ONE_API_FPGA_OBJECT_NAME}.o
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}")
add_custom_command(
OUTPUT ${REGISTER_AND_BUILD_ONE_API_FPGA_OBJECT_NAME}.o
COMMAND
Expand All @@ -42,7 +46,8 @@ function(register_one_api_fpga_object)
list(APPEND ONE_API_FPGA_OBJECTS_TARGETS "one_api_fpga_object_${REGISTER_AND_BUILD_ONE_API_FPGA_OBJECT_NAME}")
set(ONE_API_FPGA_OBJECTS ${ONE_API_FPGA_OBJECTS} PARENT_SCOPE)
set(ONE_API_FPGA_OBJECTS_TARGETS ${ONE_API_FPGA_OBJECTS_TARGETS} PARENT_SCOPE)
message(STATUS "register_one_api_fpga_object targets: ${ONE_API_FPGA_OBJECTS_TARGETS}")
message(STATUS "[REGISTER_ONE_API_FPGA_OBJECT]: register_one_api_fpga_object: ${ONE_API_FPGA_OBJECTS}")
message(STATUS "[REGISTER_ONE_API_FPGA_OBJECT]: register_one_api_fpga_object targets: ${ONE_API_FPGA_OBJECTS_TARGETS}")
endfunction()


Expand All @@ -52,19 +57,31 @@ function(create_one_api_fpga_library)
set(multiValueArgs)
cmake_parse_arguments(CREATE_ONE_API_FPGA_LIBRARY "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

set(LIBRARY_ARCHIVE ${CMAKE_BINARY_DIR}/${CREATE_ONE_API_FPGA_LIBRARY_NAME}.a)

message(STATUS "[CREATE_ONE_API_FPGA_LIBRARY]: custom target: CREATE_${CREATE_ONE_API_FPGA_LIBRARY_NAME}
DEPENDS ${LIBRARY_ARCHIVE}")
add_custom_target(CREATE_${CREATE_ONE_API_FPGA_LIBRARY_NAME}
DEPENDS ${LIBRARY_ARCHIVE}
)
message(STATUS "[CREATE_ONE_API_FPGA_LIBRARY]: ${ONE_API_FPGA_LIBTOOL} -v ${ONE_API_FPGA_OBJECTS}
--target sycl --create ${LIBRARY_ARCHIVE}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS ${ONE_API_FPGA_OBJECTS_TARGETS}")
add_custom_command(
OUTPUT ${CREATE_ONE_API_FPGA_LIBRARY_NAME}.a
OUTPUT ${LIBRARY_ARCHIVE}
COMMAND
${ONE_API_FPGA_LIBTOOL} -v ${ONE_API_FPGA_OBJECTS}
--target sycl --create ${CREATE_ONE_API_FPGA_LIBRARY_NAME}.a
${ONE_API_FPGA_LIBTOOL} ${ONE_API_FPGA_OBJECTS}
--target sycl --create ${LIBRARY_ARCHIVE}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Running fpga_libtool to build ${CREATE_ONE_API_FPGA_LIBRARY_NAME}.a"
DEPENDS ${ONE_API_FPGA_OBJECTS_TARGETS}
COMMENT "Running fpga_libtool to build ${LIBRARY_ARCHIVE}"
)
add_custom_target(${CREATE_ONE_API_FPGA_LIBRARY_NAME}
DEPENDS ${CREATE_ONE_API_FPGA_LIBRARY_NAME}.a
)
message(STATUS "Name: ${CREATE_ONE_API_FPGA_LIBRARY_NAME}. Targets: ${ONE_API_FPGA_OBJECTS_TARGETS}")
add_dependencies(${CREATE_ONE_API_FPGA_LIBRARY_NAME} ${ONE_API_FPGA_OBJECTS_TARGETS})

message(STATUS "[CREATE_ONE_API_FPGA_LIBRARY]: Name: ${CREATE_ONE_API_FPGA_LIBRARY_NAME}. Targets: ${ONE_API_FPGA_OBJECTS_TARGETS}")
add_library(${CREATE_ONE_API_FPGA_LIBRARY_NAME} STATIC IMPORTED GLOBAL)
add_dependencies(${CREATE_ONE_API_FPGA_LIBRARY_NAME} CREATE_${CREATE_ONE_API_FPGA_LIBRARY_NAME})
set_target_properties(${CREATE_ONE_API_FPGA_LIBRARY_NAME} PROPERTIES IMPORTED_LOCATION ${LIBRARY_ARCHIVE})
endfunction()


Expand All @@ -73,7 +90,9 @@ register_one_api_fpga_object(
NAME "rtl_lzc32"
SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_rtl_model_lzc32.cpp
SPECS ${CMAKE_CURRENT_SOURCE_DIR}/specs/lib_rtl_spec_lzc32.xml
VFILE ${CMAKE_CURRENT_SOURCE_DIR/specs/lib_rtl_lzc32.v}
)
message(STATUS "After call: ${ONE_API_FPGA_OBJECTS_TARGETS}")

create_one_api_fpga_library(NAME libtslOneAPIFPGA)

4 changes: 4 additions & 0 deletions supplementary/runtime/cpu/include/tslCPUrt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ namespace tsl {
}
}
public:
template<VectorProcessingStyle PS, template<typename...> class Fun, typename... Args>
decltype(auto) submit(Args... args) {
return Fun<PS, Args...>::apply(args...);
}
template<template<typename...> class Fun, typename... Args>
decltype(auto) submit(Args... args) {
return Fun<Args...>::apply(args...);
Expand Down
11 changes: 11 additions & 0 deletions supplementary/runtime/oneApiFPGA/include/tslOneAPIrt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,17 @@ namespace tsl {
q.wait();
}
public:
template<VectorProcessingStyle PS, template<typename...> class Fun, typename... Args>
decltype(auto) submit(Args... args) {
using FunctorClass = Fun<PS, Args...>;
return q.submit(
[&](sycl::handler& h) {
h.single_task<FunctorClass>([=]() [[intel::kernel_args_restrict]] {
return FunctorClass::apply(args...);
});
}
).wait();
}
template<template<typename...> class Fun, typename... Args>
decltype(auto) submit(Args... args) {
return q.submit(
Expand Down
10 changes: 4 additions & 6 deletions tsl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,10 @@ function(create_tsl)

if (CREATE_TSL_ARGS_TARGETS_FLAGS STREQUAL "" OR NOT DEFINED CREATE_TSL_ARGS_TARGETS_FLAGS)
set(TARGETS_FLAGS "") # STRING "space separated lscpu flags for --targets, will attempt to call lscpu if empty"
if(LSCPU_FLAGS STREQUAL "")
execute_process(
COMMAND "${Python3_EXECUTABLE}" -c "import cpuinfo; print(*cpuinfo.get_cpu_info()['flags'])"
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE TARGETS_FLAGS
)
endif()
execute_process(
COMMAND "${Python3_EXECUTABLE}" -c "import cpuinfo; print(*cpuinfo.get_cpu_info()['flags'])"
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE TARGETS_FLAGS
)
if(TARGETS_FLAGS STREQUAL "")
execute_process(
COMMAND bash -c "LANG=en;lscpu|grep -i flags | tr ' ' '\n' | grep -v -E '^Flags:|^$' | sort -d | tr '\n' ' '"
Expand Down

0 comments on commit 9a1ead5

Please sign in to comment.