From 06725beac47a03226554940c044b7c5964821236 Mon Sep 17 00:00:00 2001 From: makslevental Date: Tue, 27 Aug 2024 17:14:14 -0500 Subject: [PATCH 1/3] [wip] hsa demo --- .github/workflows/ci-linux.yml | 19 ++ experimental/CMakeLists.txt | 2 +- experimental/hsa/CMakeLists.txt | 9 + experimental/hsa/add_one.pdi | Bin 0 -> 3552 bytes experimental/hsa/add_one_insts.txt | 68 +++++ experimental/hsa/aie_hsa_dispatch_test.cc | 309 ++++++++++++++++++++++ 6 files changed, 406 insertions(+), 1 deletion(-) create mode 100644 experimental/hsa/CMakeLists.txt create mode 100644 experimental/hsa/add_one.pdi create mode 100644 experimental/hsa/add_one_insts.txt create mode 100644 experimental/hsa/aie_hsa_dispatch_test.cc diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 100ea0eca..383b99d8a 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -200,3 +200,22 @@ jobs: iree-install \ print_ir_aie2xclbin_results \ $PWD/llvm-aie + + test_hsa: + name: HSA MWE + strategy: + fail-fast: false + runs-on: linux-phoenix-20240606 + steps: + - name: "Checking out repository" # for test scripts + uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 + with: + submodules: false # not required for testbench + + - name: Build and run HSA example + run: | + cd experimental/hsa + mkdir build && pushd build + cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja + ninja aie_hsa_dispatch_test + ./aie_hsa_dispatch_test $PWD/.. diff --git a/experimental/CMakeLists.txt b/experimental/CMakeLists.txt index ae2678c84..dee0aab59 100644 --- a/experimental/CMakeLists.txt +++ b/experimental/CMakeLists.txt @@ -4,4 +4,4 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -iree_add_all_subdirs() +add_subdirectory(delegate) diff --git a/experimental/hsa/CMakeLists.txt b/experimental/hsa/CMakeLists.txt new file mode 100644 index 000000000..8c299a30a --- /dev/null +++ b/experimental/hsa/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +find_package(hsa-runtime64 REQUIRED) +add_executable(aie_hsa_dispatch_test aie_hsa_dispatch_test.cc) +target_link_libraries(aie_hsa_dispatch_test PUBLIC hsa-runtime64::hsa-runtime64) diff --git a/experimental/hsa/add_one.pdi b/experimental/hsa/add_one.pdi new file mode 100644 index 0000000000000000000000000000000000000000..f21475d21f3ef286d180bea6e2a3127c53a48239 GIT binary patch literal 3552 zcmcInO>7%Q6n^VHc)u)J;mr zN*D9Ey|nSJ>$4nB9)I@2xic4DJ$wG#w!`=~^rN}Xp4JYP=rB6pG?ev30Xc@u z`#1R5n!Oo>gH-=>h(OeeIm~^&Xh#q%qCnxcI;BCNT+pPUI!{~s9)j9G)}~J?lRr!; zoswTc9wY)4*Zo9!?a}5die*tzK`x!&;GYv4ww=i)4jmaM`YiaO8}8Zc)Q{y~JQm-L zDCygBKJ;l6zD(IyCVM=ln$PeNp(ce!dB?mT8o{;8dfzr~Z-tFp8)4(-df2#Yhm8^R;mvUV z#=S7+GZUcHlKoI8w%v*TWF>!M{;qwXroEt3Y(=5x6}6z3lbGP-D18AWb`u!u>+Jm) zdSM}!z4gMnCwcrW5*hQxJMyCX4*OF_*_t(Xd&m0c1C>#|->7B?>&%)W|M2J7Q%kBj zTyy>ZMgGce{$_d*FqB39W_m>afJOeULN6@7e8IXWd0ai^^XJHy)ce?wy3QhiQ zx1%Zqo&E66?{4#7o$u=L|7-qBH|}YfyS9e9fphNVr+4W*b?Q{KHKVanH)7j$MtVLu zJ(Uq+KO1!)m+MjH!-kksRk`X7QVyj%)!A_M#MBYO-jxGI!&H;T9Xp)wLlr7+g}J^a z$)emTTO%r6RyA){A_l5!COh5g>GH#?P4KfEz&1EPZ@icumHM`@KJ+`bR(jobeaS1D z0C9}TD;y7d+X7buq!e_k4`?Y(y${cIEIe@BN>k>E`gmsTQhacJ1n&uscX91{EN_0T zr#j#J=#PX(rD;_wU0#>wY@GJ8$NAvrwIL^-UXDq|)Nj#D^FFuQE0k%z*ut|zj;55h zb`b;Tb=rFe@VhXeKr-9;73p+IW}hm*E6Z&R1=b~ON;n~5OTuXh4@r1b!Z``gNO)et zixMtMcv-?L5-v;F1m^YeS`xC}a$Y_{FJY77+k9Ye?IiipB|jwbb(t^u)Lk4w$cW;^ ztyB1m0&TgCOw<5@rOV2R-~s>lhC?wOK2D9lfnQ&XZwbETCqw%6Oz^>4{2|9be?Ivd z_|2TF;LmMBFV2pOc|~lCLKbHU=SY`q5fJK>eXhS2vVd~b;>!M;WuJUQ=9}O#zewKr zf96{*KSx|-ceazZev^aoshQqmJ0B$QLZk1Gc`-vU1Zy@yHC$Dzl%1ScrzgV5ja>bqrq txm}u1d&OrI`ih@bM=x@Rk0}1OCp;l62O;x+FCCrFt>BnqERV0WzW_#=ycPfe literal 0 HcmV?d00001 diff --git a/experimental/hsa/add_one_insts.txt b/experimental/hsa/add_one_insts.txt new file mode 100644 index 000000000..a5e9f9d33 --- /dev/null +++ b/experimental/hsa/add_one_insts.txt @@ -0,0 +1,68 @@ +06030100 +00000105 +00000007 +00000110 +00000001 +00000000 +0001D000 +00000030 +00000400 +00000000 +00000000 +00000000 +80000000 +00000000 +00000000 +02000000 +00000081 +00000030 +00000000 +00000000 +00000000 +00000000 +0001D004 +00000000 +00000001 +00000000 +00000000 +00000000 +00000000 +00000000 +0001D204 +00000000 +80000000 +00000018 +00000001 +00000000 +0001D020 +00000030 +00000400 +00000000 +00000000 +00000000 +80000000 +00000000 +00000000 +02000000 +00000081 +00000030 +00000000 +00000000 +00000000 +00000000 +0001D024 +00000000 +00000000 +00000000 +00000000 +00000000 +00000000 +00000000 +0001D214 +00000000 +00000001 +00000018 +00000080 +00000010 +00000000 +00010100 diff --git a/experimental/hsa/aie_hsa_dispatch_test.cc b/experimental/hsa/aie_hsa_dispatch_test.cc new file mode 100644 index 000000000..799a060df --- /dev/null +++ b/experimental/hsa/aie_hsa_dispatch_test.cc @@ -0,0 +1,309 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +namespace { + +hsa_status_t get_agent(hsa_agent_t agent, std::vector *agents, + hsa_device_type_t requested_dev_type) { + if (!agents || !(requested_dev_type == HSA_DEVICE_TYPE_AIE || + requested_dev_type == HSA_DEVICE_TYPE_GPU || + requested_dev_type == HSA_DEVICE_TYPE_CPU)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + hsa_device_type_t device_type; + hsa_status_t ret = + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); + + if (ret != HSA_STATUS_SUCCESS) { + return ret; + } + + if (device_type == requested_dev_type) { + agents->push_back(agent); + } + + return ret; +} + +hsa_status_t get_aie_agents(hsa_agent_t agent, void *data) { + if (!data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + auto *aie_agents = reinterpret_cast *>(data); + return get_agent(agent, aie_agents, HSA_DEVICE_TYPE_AIE); +} + +hsa_status_t get_coarse_global_mem_pool(hsa_amd_memory_pool_t pool, void *data, + bool kernarg) { + hsa_amd_segment_t segment_type; + auto ret = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); + if (ret != HSA_STATUS_SUCCESS) { + return ret; + } + + if (segment_type == HSA_AMD_SEGMENT_GLOBAL) { + hsa_amd_memory_pool_global_flag_t global_pool_flags; + ret = hsa_amd_memory_pool_get_info( + pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_pool_flags); + if (ret != HSA_STATUS_SUCCESS) { + return ret; + } + + if (kernarg) { + if ((global_pool_flags & + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) && + (global_pool_flags & HSA_REGION_GLOBAL_FLAG_KERNARG)) { + *static_cast(data) = pool; + } + } else { + if ((global_pool_flags & + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) && + !(global_pool_flags & HSA_REGION_GLOBAL_FLAG_KERNARG)) { + *static_cast(data) = pool; + } + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t get_coarse_global_dev_mem_pool(hsa_amd_memory_pool_t pool, + void *data) { + return get_coarse_global_mem_pool(pool, data, false); +} + +hsa_status_t get_coarse_global_kernarg_mem_pool(hsa_amd_memory_pool_t pool, + void *data) { + return get_coarse_global_mem_pool(pool, data, true); +} + +void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, + void **buf) { + std::ifstream bin_file(file_name, + std::ios::binary | std::ios::ate | std::ios::in); + + assert(bin_file.fail() == false); + + auto size(bin_file.tellg()); + + bin_file.seekg(0, std::ios::beg); + auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); + assert(r == HSA_STATUS_SUCCESS); + bin_file.read(reinterpret_cast(*buf), size); +} + +void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, + void **buf) { + std::ifstream bin_file(file_name, + std::ios::binary | std::ios::ate | std::ios::in); + + assert(bin_file.fail() == false); + + auto size(bin_file.tellg()); + bin_file.seekg(0, std::ios::beg); + std::vector pdi_vec; + std::string val; + + while (bin_file >> val) { + pdi_vec.push_back(std::stoul(val, nullptr, 16)); + } + auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); + assert(r == HSA_STATUS_SUCCESS); + std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t)); +} + +} // namespace + +int main(int argc, char **argv) { + std::filesystem::path sourcePath(argv[1]); + // List of AIE agents in the system. + std::vector aie_agents; + // For creating a queue on an AIE agent. + hsa_queue_t *aie_queue(nullptr); + // Memory pool for allocating device-mapped memory. Used for PDI/DPU + // instructions. + hsa_amd_memory_pool_t global_dev_mem_pool{0}; + // System memory pool. Used for allocating kernel argument data. + hsa_amd_memory_pool_t global_kernarg_mem_pool{0}; + const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt"); + const std::string pdi_file_name(sourcePath / "add_one.pdi"); + uint32_t *dpu_inst_buf(nullptr); + uint64_t *pdi_buf(nullptr); + + assert(aie_agents.empty()); + assert(global_dev_mem_pool.handle == 0); + assert(global_kernarg_mem_pool.handle == 0); + + // Initialize the runtime. + auto r = hsa_init(); + assert(r == HSA_STATUS_SUCCESS); + + assert(sizeof(hsa_kernel_dispatch_packet_s) == + sizeof(hsa_amd_aie_ert_packet_s)); + + // Test a launch of an AIE kernel using the HSA API. + // Find the AIE agents in the system. + r = hsa_iterate_agents(get_aie_agents, &aie_agents); + assert(r == HSA_STATUS_SUCCESS); + // assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) == + // HSA_STATUS_SUCCESS); + assert(aie_agents.size() == 1); + + const auto &aie_agent = aie_agents.front(); + + // Create a queue on the first agent. + r = hsa_queue_create(aie_agent, 64, HSA_QUEUE_TYPE_SINGLE, nullptr, nullptr, + 0, 0, &aie_queue); + assert(r == HSA_STATUS_SUCCESS); + assert(aie_queue); + assert(aie_queue->base_address); + + // Find a pool for DEV BOs. This is a global system memory pool that is + // mapped to the device. Will be used for PDIs and DPU instructions. + r = hsa_amd_agent_iterate_memory_pools( + aie_agent, get_coarse_global_dev_mem_pool, &global_dev_mem_pool); + assert(r == HSA_STATUS_SUCCESS); + + // Find a pool that supports kernel args. This is just normal system memory. + // It will be used for commands and input data. + r = hsa_amd_agent_iterate_memory_pools( + aie_agent, get_coarse_global_kernarg_mem_pool, &global_kernarg_mem_pool); + assert(r == HSA_STATUS_SUCCESS); + assert(global_kernarg_mem_pool.handle); + + // Load the DPU and PDI files into a global pool that doesn't support kernel + // args (DEV BO). + load_dpu_file(global_dev_mem_pool, dpu_inst_file_name, + reinterpret_cast(&dpu_inst_buf)); + uint32_t dpu_handle = 0; + r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle); + assert(r == HSA_STATUS_SUCCESS); + assert(dpu_handle != 0); + + load_pdi_file(global_dev_mem_pool, pdi_file_name, + reinterpret_cast(&pdi_buf)); + uint32_t pdi_handle = 0; + r = hsa_amd_get_handle_from_vaddr(pdi_buf, &pdi_handle); + assert(r == HSA_STATUS_SUCCESS); + assert(pdi_handle != 0); + + hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle, + .cu_func = 0}; + + hsa_amd_aie_ert_hw_ctx_config_cu_param_t config_cu_args{ + .num_cus = 1, .cu_configs = &cu_config}; + + // Configure the queue's hardware context. + r = hsa_amd_queue_hw_ctx_config( + aie_queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &config_cu_args); + assert(r == HSA_STATUS_SUCCESS); + + // create inputs / outputs + constexpr std::size_t num_data_elements = 1024; + constexpr std::size_t data_buffer_size = + num_data_elements * sizeof(std::uint32_t); + + std::uint32_t *input = {}; + r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0, + reinterpret_cast(&input)); + assert(r == HSA_STATUS_SUCCESS); + std::uint32_t input_handle = {}; + r = hsa_amd_get_handle_from_vaddr(input, &input_handle); + assert(r == HSA_STATUS_SUCCESS); + assert(input_handle != 0); + + std::uint32_t *output = {}; + r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0, + reinterpret_cast(&output)); + assert(r == HSA_STATUS_SUCCESS); + std::uint32_t output_handle = {}; + r = hsa_amd_get_handle_from_vaddr(output, &output_handle); + assert(r == HSA_STATUS_SUCCESS); + assert(output_handle != 0); + + for (std::size_t i = 0; i < num_data_elements; i++) { + *(input + i) = i; + *(output + i) = 0xDEFACE; + } + + ///////////////////////////////////// Creating the cmd packet + // Creating a packet to store the command + hsa_amd_aie_ert_packet_t *cmd_pkt = NULL; + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, + reinterpret_cast(&cmd_pkt)); + assert(r == HSA_STATUS_SUCCESS); + cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW; + cmd_pkt->count = 0xA; // # of arguments to put in command + cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU; + cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT; + cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC + << HSA_PACKET_HEADER_TYPE; + + // Creating the payload for the packet + hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL; + uint32_t cmd_handle; + r = hsa_amd_get_handle_from_vaddr(reinterpret_cast(cmd_pkt), + &cmd_handle); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, + reinterpret_cast(&cmd_payload)); + assert(r == HSA_STATUS_SUCCESS); + cmd_payload->cu_mask = 0x1; // Selecting the PDI to use with this command + cmd_payload->data[0] = 0x3; // Transaction opcode + cmd_payload->data[1] = 0x0; + cmd_payload->data[2] = dpu_handle; + cmd_payload->data[3] = 0x0; + cmd_payload->data[4] = 0x44; // Size of DPU instruction + cmd_payload->data[5] = input_handle; + cmd_payload->data[6] = 0; + cmd_payload->data[7] = output_handle; + cmd_payload->data[8] = 0; + cmd_pkt->payload_data = reinterpret_cast(cmd_payload); + + uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1); + uint64_t packet_id = wr_idx % aie_queue->size; + reinterpret_cast( + aie_queue->base_address)[packet_id] = *cmd_pkt; + hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx); + + for (std::size_t i = 0; i < num_data_elements; i++) { + const auto expected = *(input + i) + 1; + const auto result = *(output + i); + assert(result == expected); + } + + r = hsa_queue_destroy(aie_queue); + assert(r == HSA_STATUS_SUCCESS); + + r = hsa_amd_memory_pool_free(output); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(input); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(pdi_buf); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(dpu_inst_buf); + assert(r == HSA_STATUS_SUCCESS); + + r = hsa_shut_down(); + assert(r == HSA_STATUS_SUCCESS); + std::cout << "PASS"; +} From d174fd44fe06629dd5a7fce91aa13c9077888162 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 27 Aug 2024 22:09:27 -0500 Subject: [PATCH 2/3] add bare example --- .github/workflows/ci-linux.yml | 3 + experimental/hsa/CMakeLists.txt | 2 + experimental/hsa/aie_hsa_bare_add_one.cc | 523 ++++++++++++++++++++ experimental/hsa/aie_hsa_dispatch_test.cc | 2 +- experimental/hsa/amdxdna_accel.h | 569 ++++++++++++++++++++++ experimental/hsa/hsa_ipu.h | 270 ++++++++++ 6 files changed, 1368 insertions(+), 1 deletion(-) create mode 100644 experimental/hsa/aie_hsa_bare_add_one.cc create mode 100644 experimental/hsa/amdxdna_accel.h create mode 100644 experimental/hsa/hsa_ipu.h diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 383b99d8a..5071219ab 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -219,3 +219,6 @@ jobs: cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja ninja aie_hsa_dispatch_test ./aie_hsa_dispatch_test $PWD/.. + + ninja aie_hsa_bare_add_one + ./aie_hsa_bare_add_one $PWD/.. diff --git a/experimental/hsa/CMakeLists.txt b/experimental/hsa/CMakeLists.txt index 8c299a30a..b1655008e 100644 --- a/experimental/hsa/CMakeLists.txt +++ b/experimental/hsa/CMakeLists.txt @@ -7,3 +7,5 @@ find_package(hsa-runtime64 REQUIRED) add_executable(aie_hsa_dispatch_test aie_hsa_dispatch_test.cc) target_link_libraries(aie_hsa_dispatch_test PUBLIC hsa-runtime64::hsa-runtime64) + +add_executable(aie_hsa_bare_add_one aie_hsa_bare_add_one.cc) diff --git a/experimental/hsa/aie_hsa_bare_add_one.cc b/experimental/hsa/aie_hsa_bare_add_one.cc new file mode 100644 index 000000000..f5843d7e9 --- /dev/null +++ b/experimental/hsa/aie_hsa_bare_add_one.cc @@ -0,0 +1,523 @@ + +/* + +RUN: (add_one_test %S) | FileCheck %s +CHECK: /dev/accel/accel0 open +CHECK: Driver version 1.1 +CHECK: Heap buffer @: 0x7f313c000000 +CHECK: Loading pdi +CHECK: Pdi file size: 3552 +CHECK: Loading dpu inst +CHECK: Loading dpu inst +CHECK: DPU 0 instructions @: 0x7f313c008000 +CHECK: DPU 1 instructions @: 0x7f313c010000 +CHECK: PDI file @: 0x7f313c000000 +CHECK: PDI handle @: 2 +CHECK: Input @: 0x7f313c018000 +CHECK: Output @: 0x7f313c020000 +CHECK: Input @: 0x7f313c028000 +CHECK: Output @: 0x7f313c030000 +CHECK: Size of param_config_cu: 0x8 +CHECK: Synch bo ioctl failed for handle 11 +CHECK: Synch bo ioctl failed for handle 9 +CHECK: Synch bo ioctl failed for handle 10 +CHECK: Checking run 0: +CHECK: Checking run 1: +CHECK: PASS! +CHECK: Closing +CHECK: Done + + */ + +#include +#include +#include +#include +#include + +#include "amdxdna_accel.h" +#include "hsa_ipu.h" + +#define DATA_BUFFER_SIZE (1024 * 4) + +/* + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles. + */ +struct amdxdna_cmd_chain { + uint32_t command_count; + uint32_t submit_index; + uint32_t error_index; + uint32_t reserved[3]; + uint64_t data[] __counted_by(command_count); +}; + +/* Exec buffer command header format */ +struct amdxdna_cmd { + union { + struct { + uint32_t state : 4; + uint32_t unused : 6; + uint32_t extra_cu_masks : 2; + uint32_t count : 11; + uint32_t opcode : 5; + uint32_t reserved : 4; + }; + uint32_t header; + }; + uint32_t data[] __counted_by(count); +}; + +// These packets are variable width but using this as a +// maximum size for now +#define PACKET_SIZE 64 + +int main(int argc, char **argv) { + int drv_fd; + int ret; + const char drv_path[] = "/dev/accel/accel0"; + std::string test_dir(argv[1]); + std::string inst_path = test_dir + "/add_one_insts.txt"; + std::string pdi_path_str = test_dir + "/add_one.pdi"; + const char *dpu_inst_path = inst_path.c_str(); + const char *pdi_path = pdi_path_str.c_str(); // Add one kernel + uint32_t heap_handle; + uint32_t major, minor; + + // open the driver + drv_fd = open(drv_path, O_RDWR); + + if (drv_fd < 0) { + printf("Error %i opening %s\n", drv_fd, drv_path); + return -1; + } + + printf("%s open\n", drv_path); + + // get driver version + if (get_driver_version(drv_fd, &major, &minor) < 0) { + printf("Error getting driver version\n"); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + printf("Driver version %u.%u\n", major, minor); + + ///////////////////////////////////////////////////////////////////////////////// + // Step 0: Allocate the necessary BOs. This includes: + // 1. The operands for the two kernels that will be launched + // 2. A heap which contains: + // a. A PDI for the design that will be run + // b. Instruction sequences for both runs + + // reserve some device memory for the heap + if (alloc_heap(drv_fd, 48 * 1024 * 1024, &heap_handle) < 0) { + perror("Error allocating device heap"); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t pdi_vaddr; + uint64_t pdi_sram_vaddr; + uint32_t pdi_handle; + printf("Loading pdi\n"); + ret = load_pdi(drv_fd, &pdi_vaddr, &pdi_sram_vaddr, &pdi_handle, pdi_path); + if (ret < 0) { + printf("Error %i loading pdi\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t dpu_0_vaddr; + uint64_t dpu_0_sram_vaddr; + uint32_t dpu_0_handle; + uint32_t num_dpu_0_insts; + printf("Loading dpu inst\n"); + ret = load_instructions(drv_fd, &dpu_0_vaddr, &dpu_0_sram_vaddr, + &dpu_0_handle, dpu_inst_path, &num_dpu_0_insts); + if (ret < 0) { + printf("Error %i loading dpu instructions\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t dpu_1_vaddr; + uint64_t dpu_1_sram_vaddr; + uint32_t dpu_1_handle; + uint32_t num_dpu_1_insts; + printf("Loading dpu inst\n"); + ret = load_instructions(drv_fd, &dpu_1_vaddr, &dpu_1_sram_vaddr, + &dpu_1_handle, dpu_inst_path, &num_dpu_1_insts); + if (ret < 0) { + printf("Error %i loading dpu instructions\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + printf("DPU 0 instructions @: %p\n", (void *)dpu_0_vaddr); + printf("DPU 1 instructions @: %p\n", (void *)dpu_1_vaddr); + printf("PDI file @: %p\n", (void *)pdi_vaddr); + printf("PDI handle @: %d\n", pdi_handle); + + uint64_t input_0; + uint64_t input_0_sram_vaddr; + uint32_t input_0_handle; + ret = create_dev_bo(drv_fd, &input_0, &input_0_sram_vaddr, &input_0_handle, + DATA_BUFFER_SIZE); + // ret = create_shmem_bo(drv_fd, &input_0, &input_0_sram_vaddr, + // &input_0_handle, DATA_BUFFER_SIZE); + printf("Input @: %p\n", (void *)input_0); + if (ret < 0) { + printf("Error %i creating data 0\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t output_0; + uint64_t output_0_sram_vaddr; + uint32_t output_0_handle; + ret = create_dev_bo(drv_fd, &output_0, &output_0_sram_vaddr, &output_0_handle, + DATA_BUFFER_SIZE); + // ret = create_shmem_bo(drv_fd, &output_0, &output_0_sram_vaddr, + // &output_0_handle, DATA_BUFFER_SIZE); + printf("Output @: %p\n", (void *)output_0); + if (ret < 0) { + printf("Error %i creating data 1\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t input_1; + uint64_t input_1_sram_vaddr; + uint32_t input_1_handle; + ret = create_dev_bo(drv_fd, &input_1, &input_1_sram_vaddr, &input_1_handle, + DATA_BUFFER_SIZE); + // ret = create_shmem_bo(drv_fd, &input_1, &input_1_sram_vaddr, + // &input_1_handle, DATA_BUFFER_SIZE); + printf("Input @: %p\n", (void *)input_1); + if (ret < 0) { + printf("Error %i creating data 0\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t output_1; + uint64_t output_1_sram_vaddr; + uint32_t output_1_handle; + ret = create_dev_bo(drv_fd, &output_1, &output_1_sram_vaddr, &output_1_handle, + DATA_BUFFER_SIZE); + // ret = create_shmem_bo(drv_fd, &output_1, &output_1_sram_vaddr, + // &output_1_handle, DATA_BUFFER_SIZE); + printf("Output @: %p\n", (void *)output_1); + if (ret < 0) { + printf("Error %i creating data 1\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + *((uint32_t *)input_0 + i) = i; + *((uint32_t *)input_1 + i) = i + 0xFEEDED1E; + *((uint32_t *)output_0 + i) = 0xDEFACE; + *((uint32_t *)output_1 + i) = 0xDEADBEEF; + } + + // Writing the user buffers + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + sync_bo(drv_fd, input_1_handle); + sync_bo(drv_fd, output_1_handle); + + // Performing a sync on the queue descriptor, completion signal, queue buffer + // and config cu bo. + sync_bo(drv_fd, dpu_0_handle); + sync_bo(drv_fd, dpu_1_handle); + sync_bo(drv_fd, pdi_handle); + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + + ///////////////////////////////////////////////////////////////////////////////// + // Step 1: Create a user mode queue + // This is going to be where we create a queue where we: + // 1. Create and configure a hardware context + // 2. Allocate the queue buffer as a user-mode queue + + // Allocating a structure to store QOS information + struct amdxdna_qos_info *qos = + (struct amdxdna_qos_info *)malloc(sizeof(struct amdxdna_qos_info)); + qos->gops = 0; + qos->fps = 0; + qos->dma_bandwidth = 0; + qos->latency = 0; + qos->frame_exec_time = 0; + qos->priority = 0; + + // This is the structure that we pass + struct amdxdna_drm_create_hwctx create_hw_ctx = { + .ext = 0, + .ext_flags = 0, + .qos_p = (uint64_t)qos, + .umq_bo = 0, + .log_buf_bo = 0, + .max_opc = 0x800, // Not sure what this is but this was the value used + .num_tiles = 4, + .mem_size = 0, + .umq_doorbell = 0, + }; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hw_ctx); + if (ret != 0) { + perror("Failed to create hwctx"); + return -1; + } + + // Creating a structure to configure the CU + struct amdxdna_cu_config cu_config = { + .cu_bo = pdi_handle, + .cu_func = 0, + }; + + // Creating a structure to configure the hardware context + struct amdxdna_hwctx_param_config_cu param_config_cu; + param_config_cu.num_cus = 1; + param_config_cu.cu_configs[0] = cu_config; + + printf("Size of param_config_cu: 0x%lx\n", sizeof(param_config_cu)); + + // Configuring the hardware context with the PDI + struct amdxdna_drm_config_hwctx config_hw_ctx = { + .handle = create_hw_ctx.handle, + .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU, + .param_val = + (uint64_t)¶m_config_cu, // Pass in the pointer to the param value + .param_val_size = 0x10, // Size of param config CU is 16B + }; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx); + if (ret != 0) { + perror("Failed to config hwctx"); + return -1; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Step 2: Configuring the CMD BOs with the different instruction sequences + struct amdxdna_drm_create_bo create_cmd_bo_0 = { + .type = AMDXDNA_BO_CMD, + .size = PACKET_SIZE, + }; + int cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo_0); + if (cmd_bo_ret != 0) { + perror("Failed to create cmd_0"); + return -1; + } + + struct amdxdna_drm_get_bo_info cmd_bo_0_get_bo_info = { + .handle = create_cmd_bo_0.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_0_get_bo_info); + if (ret != 0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + // Writing the first packet to the queue + struct amdxdna_cmd *cmd_0 = (struct amdxdna_cmd *)mmap( + 0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, + cmd_bo_0_get_bo_info.map_offset); + cmd_0->state = 1; // ERT_CMD_STATE_NEW; + cmd_0->extra_cu_masks = 0; + cmd_0->count = 0xF; // NOTE: For some reason this needs to be larger + cmd_0->opcode = 0x0; // ERT_START_CU; + cmd_0->data[0] = 0x3; // NOTE: This one seems to be skipped + cmd_0->data[1] = 0x3; // Transaction opcode + cmd_0->data[2] = 0x0; + cmd_0->data[3] = dpu_0_sram_vaddr; + cmd_0->data[4] = 0x0; + cmd_0->data[5] = 0x44; // Size of DPU instruction + cmd_0->data[6] = input_0 & 0xFFFFFFFF; // Input low + cmd_0->data[7] = (input_0 >> 32) & 0xFFFFFFFF; // Input high + cmd_0->data[8] = output_0 & 0xFFFFFFFF; // Output low + cmd_0->data[9] = (output_0 >> 32) & 0xFFFFFFFF; // Output high + + // Writing to the second packet of the queue + struct amdxdna_drm_create_bo create_cmd_bo_1 = { + .type = AMDXDNA_BO_CMD, + .size = PACKET_SIZE, + }; + cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo_1); + if (cmd_bo_ret != 0) { + perror("Failed to create cmd_1"); + return -1; + } + + struct amdxdna_drm_get_bo_info cmd_bo_1_get_bo_info = { + .handle = create_cmd_bo_1.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_1_get_bo_info); + if (ret != 0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + struct amdxdna_cmd *cmd_1 = (struct amdxdna_cmd *)mmap( + 0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, + cmd_bo_1_get_bo_info.map_offset); + cmd_1->state = 1; // ERT_CMD_STATE_NEW; + cmd_1->extra_cu_masks = 0; + cmd_1->count = 10; // Number of commands + cmd_1->opcode = 0x0; // ERT_START_CU; + cmd_1->data[0] = 0x3; // This one seems to be skipped + cmd_1->data[1] = 0x3; // Transaction opcode + cmd_1->data[2] = 0x0; + cmd_1->data[3] = dpu_1_sram_vaddr; + cmd_1->data[4] = 0x0; + cmd_1->data[5] = 0x44; // Size of DPU instruction + cmd_1->data[6] = input_1 & 0xFFFFFFFF; // Input low + cmd_1->data[7] = (input_1 >> 32) & 0xFFFFFFFF; // Input high + cmd_1->data[8] = output_1 & 0xFFFFFFFF; // Output low + cmd_1->data[9] = (output_1 >> 32) & 0xFFFFFFFF; // Output high + + ///////////////////////////////////////////////////////////////////////////////// + // Step 3: Submit commands -- This requires creating a BO_EXEC that contains + // the command chain that points to the instruction sequences just created + + // Allocate a command chain + void *bo_cmd_chain_buf = NULL; + cmd_bo_ret = posix_memalign(&bo_cmd_chain_buf, 4096, 4096); + if (cmd_bo_ret != 0 || bo_cmd_chain_buf == NULL) { + printf("[ERROR] Failed to allocate cmd_bo buffer of size %d\n", 4096); + } + + struct amdxdna_drm_create_bo create_cmd_chain_bo = { + .type = AMDXDNA_BO_CMD, + .size = 4096, + }; + cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_chain_bo); + if (cmd_bo_ret != 0) { + perror("Failed to create command chain BO"); + return -1; + } + + struct amdxdna_drm_get_bo_info cmd_chain_bo_get_bo_info = { + .handle = create_cmd_chain_bo.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_chain_bo_get_bo_info); + if (ret != 0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + struct amdxdna_cmd *cmd_chain = + (struct amdxdna_cmd *)mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, + drv_fd, cmd_chain_bo_get_bo_info.map_offset); + + // Writing information to the command buffer + struct amdxdna_cmd_chain *cmd_chain_payload = + (struct amdxdna_cmd_chain *)(cmd_chain->data); + cmd_chain->state = 1; // ERT_CMD_STATE_NEW; + cmd_chain->extra_cu_masks = 0; + cmd_chain->count = 0xA; // TODO: Why is this the value? + cmd_chain->opcode = 0x13; // ERT_CMD_CHAIN + cmd_chain_payload->command_count = 2; + cmd_chain_payload->submit_index = 0; + cmd_chain_payload->error_index = 0; + cmd_chain_payload->data[0] = create_cmd_bo_0.handle; + cmd_chain_payload->data[1] = create_cmd_bo_1.handle; + + // Reading the user buffers + sync_bo(drv_fd, create_cmd_chain_bo.handle); + sync_bo(drv_fd, create_cmd_bo_0.handle); + sync_bo(drv_fd, create_cmd_bo_1.handle); + + // Perform a submit cmd + uint32_t bo_args[6] = {dpu_0_handle, dpu_1_handle, input_0_handle, + output_0_handle, input_1_handle, output_1_handle}; + struct amdxdna_drm_exec_cmd exec_cmd_0 = { + .ext = 0, + .ext_flags = 0, + .hwctx = create_hw_ctx.handle, + .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF, + .cmd_handles = create_cmd_chain_bo.handle, + .args = (uint64_t)bo_args, + .cmd_count = 1, + .arg_count = sizeof(bo_args) / sizeof(uint32_t), + }; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, &exec_cmd_0); + if (ret != 0) { + perror("Failed to submit work"); + return -1; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Step 4: Wait for the output + // Use the wait IOCTL to wait for our submission to complete + struct amdxdna_drm_wait_cmd wait_cmd = { + .hwctx = create_hw_ctx.handle, + .timeout = 50, // 50ms timeout + .seq = exec_cmd_0.seq, + }; + + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd); + if (ret != 0) { + perror("Failed to wait"); + return -1; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Step 5: Verify output + + // Reading the user buffers + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + sync_bo(drv_fd, input_1_handle); + sync_bo(drv_fd, output_1_handle); + + int errors = 0; + printf("Checking run 0:\n"); + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + uint32_t src = *((uint32_t *)input_0 + i); + uint32_t dst = *((uint32_t *)output_0 + i); + // printf("src: 0x%x\n", src); + // printf("dst: 0x%x\n", dst); + if (src + 1 != dst) { + printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst); + errors++; + } + } + + printf("Checking run 1:\n"); + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + uint32_t src = *((uint32_t *)input_1 + i); + uint32_t dst = *((uint32_t *)output_1 + i); + // printf("src: 0x%x\n", src); + // printf("dst: 0x%x\n", dst); + if (src + 1 != dst) { + printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst); + errors++; + } + } + + if (!errors) { + printf("PASS!\n"); + } else { + printf("FAIL! %d/2048\n", errors); + } + + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return 0; +} \ No newline at end of file diff --git a/experimental/hsa/aie_hsa_dispatch_test.cc b/experimental/hsa/aie_hsa_dispatch_test.cc index 799a060df..fca55e4a4 100644 --- a/experimental/hsa/aie_hsa_dispatch_test.cc +++ b/experimental/hsa/aie_hsa_dispatch_test.cc @@ -305,5 +305,5 @@ int main(int argc, char **argv) { r = hsa_shut_down(); assert(r == HSA_STATUS_SUCCESS); - std::cout << "PASS"; + std::cout << "PASS\n"; } diff --git a/experimental/hsa/amdxdna_accel.h b/experimental/hsa/amdxdna_accel.h new file mode 100644 index 000000000..048fd3fe6 --- /dev/null +++ b/experimental/hsa/amdxdna_accel.h @@ -0,0 +1,569 @@ +/* SPDX-License-Identifier: NCSA OR GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#ifndef AMDXDNA_ACCEL_H_ +#define AMDXDNA_ACCEL_H_ + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef __counted_by +#define __counted_by(cnt) +#endif + +#define AMDXDNA_DRIVER_MAJOR 1 +#define AMDXDNA_DRIVER_MINOR 0 + +#define AMDXDNA_INVALID_CMD_HANDLE (~0UL) +#define AMDXDNA_INVALID_ADDR (~0UL) +#define AMDXDNA_INVALID_CTX_HANDLE 0 +#define AMDXDNA_INVALID_BO_HANDLE 0 + +/* + * The interface can grow/extend over time. + * On each struct amdxdna_drm_*, to support potential extension, we defined it + * like this. + * + * Example code: + * + * struct amdxdna_drm_example_data { + * .ext = (uintptr_t)&example_data_ext; + * ... + * }; + * + * We don't have extension now. The extension struct will define in the future. + */ + +enum amdxdna_drm_ioctl_id { + DRM_AMDXDNA_CREATE_HWCTX, + DRM_AMDXDNA_DESTROY_HWCTX, + DRM_AMDXDNA_CONFIG_HWCTX, + DRM_AMDXDNA_CREATE_BO, + DRM_AMDXDNA_GET_BO_INFO, + DRM_AMDXDNA_SYNC_BO, + DRM_AMDXDNA_EXEC_CMD, + DRM_AMDXDNA_WAIT_CMD, + DRM_AMDXDNA_GET_INFO, + DRM_AMDXDNA_SET_STATE, + DRM_AMDXDNA_NUM_IOCTLS +}; + +enum amdxdna_device_type { + AMDXDNA_DEV_TYPE_UNKNOWN = -1, + AMDXDNA_DEV_TYPE_KMQ, + AMDXDNA_DEV_TYPE_UMQ, +}; + +/** + * struct qos_info - QoS information for driver. + * @gops: Giga operations per second. + * @fps: Frames per second. + * @dma_bandwidth: DMA bandwidtha. + * @latency: Frame response latency. + * @frame_exec_time: Frame execution time. + * @priority: Request priority. + * + * User program can provide QoS hints to driver. + */ +struct amdxdna_qos_info { + __u32 gops; + __u32 fps; + __u32 dma_bandwidth; + __u32 latency; + __u32 frame_exec_time; + __u32 priority; +}; + +/** + * struct amdxdna_drm_create_hwctx - Create hardware context. + * @ext: MBZ. + * @ext_flags: MBZ. + * @qos_p: Address of QoS info. + * @umq_bo: BO handle for user mode queue(UMQ). + * @log_buf_bo: BO handle for log buffer. + * @max_opc: Maximum operations per cycle. + * @num_tiles: Number of AIE tiles. + * @mem_size: Size of AIE tile memory. + * @umq_doorbell: Returned offset of doorbell associated with UMQ. + * @handle: Returned hardware context handle. + */ +struct amdxdna_drm_create_hwctx { + __u64 ext; + __u64 ext_flags; + __u64 qos_p; + __u32 umq_bo; + __u32 log_buf_bo; + __u32 max_opc; + __u32 num_tiles; + __u32 mem_size; + __u32 umq_doorbell; + __u32 handle; +}; + +/** + * struct amdxdna_drm_destroy_hwctx - Destroy hardware context. + * @handle: Hardware context handle. + * @pad: MBZ. + */ +struct amdxdna_drm_destroy_hwctx { + __u32 handle; + __u32 pad; +}; + +/** + * struct amdxdna_cu_config - configuration for one CU + * @cu_bo: CU configuration buffer bo handle + * @cu_func: Functional of a CU + * @pad: MBZ + */ +struct amdxdna_cu_config { + __u32 cu_bo; + __u8 cu_func; + __u8 pad[3]; +}; + +/** + * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware + * context + * @num_cus: Number of CUs to configure + * @pad: MBZ + * @cu_configs: Array of CU configurations of struct amdxdna_cu_config + */ +struct amdxdna_hwctx_param_config_cu { + __u16 num_cus; + __u16 pad[3]; + struct amdxdna_cu_config cu_configs[] __counted_by(num_cus); +}; + +enum amdxdna_drm_config_hwctx_param { + DRM_AMDXDNA_HWCTX_CONFIG_CU, + DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, + DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, + DRM_AMDXDNA_HWCTX_CONFIG_NUM +}; + +/** + * struct amdxdna_drm_config_hwctx - Configure hardware context. + * @handle: hardware context handle. + * @param_type: Value in enum amdxdna_drm_config_hwctx_param. Specifies the + * structure passed in via param_val. + * @param_val: A structure specified by the param_type struct member. + * @param_val_size: Size of the parameter buffer pointed to by the param_val. + * If param_val is not a pointer, driver can ignore this. + * + * Note: if the param_val is a pointer pointing to a buffer, the maximum size + * of the buffer is 4KiB(PAGE_SIZE). + */ +struct amdxdna_drm_config_hwctx { + __u32 handle; + __u32 param_type; + __u64 param_val; + __u32 param_val_size; + __u32 pad; +}; + +/* + * AMDXDNA_BO_SHMEM: DRM GEM SHMEM bo + * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory + * AMDXDNA_BO_DEV_BO: Allocated from BO_DEV_HEAP + * AMDXDNA_BO_CMD: User and driver accessible bo + * AMDXDNA_BO_DMA: DRM GEM DMA bo + */ +enum amdxdna_bo_type { + AMDXDNA_BO_INVALID = 0, + AMDXDNA_BO_SHMEM, + AMDXDNA_BO_DEV_HEAP, + AMDXDNA_BO_DEV, + AMDXDNA_BO_CMD, + AMDXDNA_BO_DMA, +}; + +/** + * struct amdxdna_drm_create_bo - Create a buffer object. + * @flags: Buffer flags. MBZ. + * @type: Buffer type. + * @vaddr: User VA of buffer if applied. MBZ. + * @size: Size in bytes. + * @handle: Returned DRM buffer object handle. + */ +struct amdxdna_drm_create_bo { + __u64 flags; + __u32 type; + __u32 _pad; + __u64 vaddr; + __u64 size; + __u32 handle; +}; + +/** + * struct amdxdna_drm_get_bo_info - Get buffer object information. + * @ext: MBZ. + * @ext_flags: MBZ. + * @handle: DRM buffer object handle. + * @map_offset: Returned DRM fake offset for mmap(). + * @vaddr: Returned user VA of buffer. 0 in case user needs mmap(). + * @xdna_addr: Returned XDNA device virtual address. + */ +struct amdxdna_drm_get_bo_info { + __u64 ext; + __u64 ext_flags; + __u32 handle; + __u32 _pad; + __u64 map_offset; + __u64 vaddr; + __u64 xdna_addr; +}; + +/** + * struct amdxdna_drm_sync_bo - Sync buffer object. + * @handle: Buffer object handle. + * @direction: Direction of sync, can be from device or to device. + * @offset: Offset in the buffer to sync. + * @size: Size in bytes. + */ +struct amdxdna_drm_sync_bo { + __u32 handle; +#define SYNC_DIRECT_TO_DEVICE 0U +#define SYNC_DIRECT_FROM_DEVICE 1U + __u32 direction; + __u64 offset; + __u64 size; +}; + +enum amdxdna_cmd_type { + AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0, + AMDXDNA_CMD_SUBMIT_DEPENDENCY, + AMDXDNA_CMD_SUBMIT_SIGNAL, +}; + +/** + * struct amdxdna_drm_exec_cmd - Execute command. + * @ext: MBZ. + * @ext_flags: MBZ. + * @hwctx: Hardware context handle. + * @type: One of command type in enum amdxdna_cmd_type. + * @cmd_handles: Array of command handles or the command handle itself in case + * of just one. + * @args: Array of arguments for all command handles. + * @cmd_count: Number of command handles in the cmd_handles array. + * @arg_count: Number of arguments in the args array. + * @seq: Returned sequence number for this command. + */ +struct amdxdna_drm_exec_cmd { + __u64 ext; + __u64 ext_flags; + __u32 hwctx; + __u32 type; + __u64 cmd_handles; + __u64 args; + __u32 cmd_count; + __u32 arg_count; + __u64 seq; +}; + +/** + * struct amdxdna_drm_wait_cmd - Wait exectuion command. + * + * @hwctx: hardware context handle. + * @timeout: timeout in ms, 0 implies infinite wait. + * @seq: sequence number of the command returned by execute command. + * + * Wait a command specified by seq to be completed. + * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot + * to submit a new command. + */ +struct amdxdna_drm_wait_cmd { + __u32 hwctx; + __u32 timeout; + __u64 seq; +}; + +/** + * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware + * @buffer: The user space buffer that will return the AIE status + * @buffer_size: The size of the user space buffer + * @cols_filled: A bitmap of AIE columns whose data has been returned in the + * buffer. + */ +struct amdxdna_drm_query_aie_status { + __u64 buffer; /* out */ + __u32 buffer_size; /* in */ + __u32 cols_filled; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware + * @major: The major version number + * @minor: The minor version number + */ +struct amdxdna_drm_query_aie_version { + __u32 major; /* out */ + __u32 minor; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_tile_metadata - Query the metadata of AIE tile + * (core, mem, shim) + * @row_count: The number of rows. + * @row_start: The starting row number. + * @dma_channel_count: The number of dma channels. + * @lock_count: The number of locks. + * @event_reg_count: The number of events. + * @pad: MBZ. + */ +struct amdxdna_drm_query_aie_tile_metadata { + __u16 row_count; + __u16 row_start; + __u16 dma_channel_count; + __u16 lock_count; + __u16 event_reg_count; + __u16 pad[3]; +}; + +/** + * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE + * hardware + * @col_size: The size of a column in bytes. + * @cols: The total number of columns. + * @rows: The total number of rows. + * @version: The version of the AIE hardware. + * @core: The metadata for all core tiles. + * @mem: The metadata for all mem tiles. + * @shim: The metadata for all shim tiles. + */ +struct amdxdna_drm_query_aie_metadata { + __u32 col_size; + __u16 cols; + __u16 rows; + struct amdxdna_drm_query_aie_version version; + struct amdxdna_drm_query_aie_tile_metadata core; + struct amdxdna_drm_query_aie_tile_metadata mem; + struct amdxdna_drm_query_aie_tile_metadata shim; +}; + +/** + * struct amdxdna_drm_query_clock - Metadata for a clock + * @name: The clock name. + * @freq_mhz: The clock frequency. + * @pad: MBZ. + */ +struct amdxdna_drm_query_clock { + __u8 name[16]; + __u32 freq_mhz; + __u32 pad; +}; + +/** + * struct amdxdna_drm_query_clock_metadata - Query metadata for clocks + * @mp_npu_clock: The metadata for MP-NPU clock. + * @h_clock: The metadata for H clock. + */ +struct amdxdna_drm_query_clock_metadata { + struct amdxdna_drm_query_clock mp_npu_clock; + struct amdxdna_drm_query_clock h_clock; +}; + +enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; + +/** + * struct amdxdna_drm_query_sensor - The data for single sensor. + * @label: The name for a sensor. + * @input: The current value of the sensor. + * @max: The maximum value possible for the sensor. + * @average: The average value of the sensor. + * @highest: The highest recorded sensor value for this driver load for the + * sensor. + * @status: The sensor status. + * @units: The sensor units. + * @unitm: Translates value member variables into the correct unit via (pow(10, + * unitm) * value) + * @type: The sensor type from enum amdxdna_sensor_type + * @pad: MBZ. + */ +struct amdxdna_drm_query_sensor { + __u8 label[64]; + __u32 input; + __u32 max; + __u32 average; + __u32 highest; + __u8 status[64]; + __u8 units[16]; + __s8 unitm; + __u8 type; + __u8 pad[6]; +}; + +/** + * struct amdxdna_drm_query_hwctx - The data for single context. + * @context_id: The ID for this context. + * @start_col: The starting column for the partition assigned to this context. + * @num_col: The number of columns in the partition assigned to this context. + * @pid: The Process ID of the process that created this context. + * @command_submissions: The number of commands submitted to this context. + * @command_completions: The number of commands completed by this context. + * @migrations: The number of times this context has been moved to a different + * partition. + * @preemptions: The number of times this context has been preempted by another + * context in the same partition. + * @pad: MBZ. + */ +struct amdxdna_drm_query_hwctx { + __u32 context_id; + __u32 start_col; + __u32 num_col; + __u32 pad; + __s64 pid; + __u64 command_submissions; + __u64 command_completions; + __u64 migrations; + __u64 preemptions; + __u64 errors; +}; + +/** + * struct amdxdna_drm_aie_mem - The data for AIE memory read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE memory address to read/write + * @size: The size of bytes to read/write + * @buf_p: The buffer to store read/write data + * + * This is used for DRM_AMDXDNA_READ_AIE_MEM and DRM_AMDXDNA_WRITE_AIE_MEM + * parameters. + */ +struct amdxdna_drm_aie_mem { + __u32 col; + __u32 row; + __u32 addr; + __u32 size; + __u64 buf_p; +}; + +/** + * struct amdxdna_drm_aie_reg - The data for AIE register read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE register address to read/write + * @val: The value to write or returned value from AIE + * + * This is used for DRM_AMDXDNA_READ_AIE_REG and DRM_AMDXDNA_WRITE_AIE_REG + * parameters. + */ +struct amdxdna_drm_aie_reg { + __u32 col; + __u32 row; + __u32 addr; + __u32 val; +}; + +enum amdxdna_drm_get_param { + DRM_AMDXDNA_QUERY_AIE_STATUS, + DRM_AMDXDNA_QUERY_AIE_METADATA, + DRM_AMDXDNA_QUERY_AIE_VERSION, + DRM_AMDXDNA_QUERY_CLOCK_METADATA, + DRM_AMDXDNA_QUERY_SENSORS, + DRM_AMDXDNA_QUERY_HW_CONTEXTS, + DRM_AMDXDNA_READ_AIE_MEM, + DRM_AMDXDNA_READ_AIE_REG, + DRM_AMDXDNA_NUM_GET_PARAM, +}; + +/** + * struct amdxdna_drm_get_info - Get some information from the AIE hardware. + * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. Size needed/written by the kernel. + * @buffer: A structure specified by the param struct member. + */ +struct amdxdna_drm_get_info { + __u32 param; /* in */ + __u32 buffer_size; /* in/out */ + __u64 buffer; /* in/out */ +}; + +enum amdxdna_power_mode_type { + XRT_POWER_MODE_DEFAULT, /**< Fallback to calculated DPM */ + XRT_POWER_MODE_LOW, /**< Set frequency to lowest DPM */ + XRT_POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ + XRT_POWER_MODE_HIGH, /**< Set frequency to highest DPM */ +}; + +/** + * struct amdxdna_drm_set_power_mode - Set the power mode of the AIE hardware + * @power_mode: The sensor type from enum amdxdna_power_mode_type + * @pad: MBZ. + */ +struct amdxdna_drm_set_power_mode { + __u8 power_mode; + __u8 pad[7]; +}; + +enum amdxdna_drm_set_param { + DRM_AMDXDNA_SET_POWER_MODE, + DRM_AMDXDNA_WRITE_AIE_MEM, + DRM_AMDXDNA_WRITE_AIE_REG, + DRM_AMDXDNA_NUM_SET_PARAM, +}; + +/** + * struct amdxdna_drm_set_state - Set the state of some component within the AIE + * hardware. + * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. + * @buffer: A structure specified by the param struct member. + */ +struct amdxdna_drm_set_state { + __u32 param; /* in */ + __u32 buffer_size; /* in */ + __u64 buffer; /* in */ +}; + +#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ + struct amdxdna_drm_create_hwctx) + +#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ + struct amdxdna_drm_destroy_hwctx) + +#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ + struct amdxdna_drm_config_hwctx) + +#define DRM_IOCTL_AMDXDNA_CREATE_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ + struct amdxdna_drm_create_bo) + +#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ + struct amdxdna_drm_get_bo_info) + +#define DRM_IOCTL_AMDXDNA_SYNC_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo) + +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd) + +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd) + +#define DRM_IOCTL_AMDXDNA_GET_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info) + +#define DRM_IOCTL_AMDXDNA_SET_STATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ + struct amdxdna_drm_set_state) + +#if defined(__cplusplus) +} /* extern c end */ +#endif + +#endif /* AMDXDNA_ACCEL_H_ */ diff --git a/experimental/hsa/hsa_ipu.h b/experimental/hsa/hsa_ipu.h new file mode 100644 index 000000000..31176a34f --- /dev/null +++ b/experimental/hsa/hsa_ipu.h @@ -0,0 +1,270 @@ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "amdxdna_accel.h" + +// want to mmap the file +#include +#include + +#define MAX_NUM_INSTRUCTIONS 1024 // Maximum number of dpu or pdi instructions. + +// Dummy packet defines + +int map_doorbell(int fd, uint64_t *doorbell) { + // Mmap the mailbox. + int32_t page_size = 4096; + *doorbell = (uint64_t)mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (doorbell != MAP_FAILED) { + printf("Doorbell mapped\n"); + return 0; + } + + printf("[ERROR] doorbell mmap failed: %s\n", strerror(errno)); + return errno; +} + +void ring_doorbell(uint64_t doorbell) { + int32_t curr_tail = *((int32_t *)doorbell); + *((uint32_t *)doorbell) = curr_tail + 0x94; +} + +int get_driver_version(int fd, __u32 *major, __u32 *minor) { + int ret; + struct amdxdna_drm_query_aie_version version; + + struct amdxdna_drm_get_info info_params = { + .param = DRM_AMDXDNA_QUERY_AIE_VERSION, + .buffer_size = sizeof(version), + .buffer = (__u64)&version, + }; + + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_INFO, &info_params); + if (ret == 0) { + *major = version.major; + *minor = version.minor; + } + + return ret; +} + +/* + Allocates a heap on the device by creating a BO of type dev heap +*/ +static int alloc_heap(int fd, __u32 size, __u32 *handle) { + int ret; + void *heap_buf = NULL; + const size_t alignment = 64 * 1024 * 1024; + ret = posix_memalign(&heap_buf, alignment, size); + if (ret != 0 || heap_buf == NULL) { + printf("[ERROR] Failed to allocate heap buffer of size %d\n", size); + } + + void *dev_heap_parent = mmap(0, alignment * 2 - 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (dev_heap_parent == MAP_FAILED) { + dev_heap_parent = nullptr; + return -1; + } + + struct amdxdna_drm_create_bo create_bo_params = { + .type = AMDXDNA_BO_DEV_HEAP, + .size = size, + }; + + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_params); + if (ret == 0 && handle) { + *handle = create_bo_params.handle; + } + + struct amdxdna_drm_get_bo_info get_bo_info = {.handle = + create_bo_params.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + // Need to free the heap buf but still use the address so we can + // ensure alignment + free(heap_buf); + heap_buf = (void *)mmap(heap_buf, size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, get_bo_info.map_offset); + printf("Heap buffer @: %p\n", heap_buf); + + return ret; +} + +/* + Creates a dev bo which is carved out of the heap bo. +*/ +static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, + __u32 *handle, __u64 size_in_bytes) { + struct amdxdna_drm_create_bo create_bo = { + .type = AMDXDNA_BO_DEV, + .size = size_in_bytes, + }; + int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo); + if (ret != 0) { + perror("Failed to create BO"); + return -1; + } + + struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + *vaddr = get_bo_info.vaddr; + *sram_vaddr = get_bo_info.xdna_addr; + *handle = create_bo.handle; + return 0; +} + +/* + Creates a shmem bo +*/ +static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, + __u32 *handle, __u64 size_in_bytes) { + const size_t alignment = 64 * 1024 * 1024; + void *shmem_create = NULL; + int ret = posix_memalign(&shmem_create, alignment, size_in_bytes); + if (ret != 0) { + printf("[ERROR] Failed to allocate shmem bo of size %lld\n", size_in_bytes); + } + + // Touching buffer to map page + *(uint32_t *)shmem_create = 0xDEADBEEF; + + printf("Shmem BO @: %p\n", shmem_create); + + struct amdxdna_drm_create_bo create_bo = {.type = AMDXDNA_BO_SHMEM, + .vaddr = (__u64)shmem_create, + .size = size_in_bytes}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo); + if (ret != 0) { + perror("Failed to create BO"); + return -1; + } + + struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + *vaddr = (__u64)shmem_create; + *sram_vaddr = get_bo_info.xdna_addr; + *handle = create_bo.handle; + return 0; +} + +/* + Wrapper around synch bo ioctl. +*/ +static int sync_bo(int fd, __u32 handle) { + struct amdxdna_drm_sync_bo sync_params = { + .handle = handle, + }; + int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params); + if (ret != 0) { + printf("Synch bo ioctl failed for handle %d\n", handle); + } + return ret; +} + +/* + Create a BO_DEV and populate it with a PDI +*/ + +static int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle, + const char *path) { + FILE *file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open instructions file."); + return -1; + } + + fseek(file, 0L, SEEK_END); + ssize_t file_size = ftell(file); + fseek(file, 0L, SEEK_SET); + + printf("Pdi file size: %ld\n", file_size); + + fclose(file); + + // Mmaping the file + int pdi_fd = open(path, O_RDONLY); + uint64_t *file_data = + (uint64_t *)mmap(0, file_size, PROT_READ, MAP_PRIVATE, pdi_fd, 0); + + // Creating a BO_DEV bo to store the pdi file. + int ret = create_dev_bo(fd, vaddr, sram_addr, handle, file_size); + if (ret != 0) { + perror("Failed to create pdi BO"); + return -1; + } + + // copy the file into Bo dev + uint64_t *bo = (uint64_t *)*vaddr; + memcpy(bo, file_data, file_size); + + close(pdi_fd); + return 0; +} + +/* + Create a BO DEV and populate it with instructions whose virtual address is + passed to the driver via an HSA packet. +*/ +static int load_instructions(int fd, uint64_t *vaddr, uint64_t *sram_addr, + __u32 *handle, const char *path, __u32 *num_inst) { + // read dpu instructions into an array + FILE *file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open instructions file."); + return -1; + } + + char *line = NULL; + size_t len = 0; + __u32 inst_array[MAX_NUM_INSTRUCTIONS]; + __u32 inst_counter = 0; + while (getline(&line, &len, file) != -1) { + inst_array[inst_counter++] = strtoul(line, NULL, 16); + if (inst_counter >= MAX_NUM_INSTRUCTIONS) { + perror("Instruction array overflowed."); + return -2; + } + } + fclose(file); + + // Creating a BO_DEV bo to store the instruction. + int ret = + create_dev_bo(fd, vaddr, sram_addr, handle, inst_counter * sizeof(__u32)); + if (ret != 0) { + perror("Failed to create dpu BO"); + return -3; + } + + *num_inst = inst_counter; + + memcpy((__u32 *)*vaddr, inst_array, inst_counter * sizeof(__u32)); + return ret; +} From 0aab102b2c766a17be892ec39515ab5a7a5a472e Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 27 Aug 2024 22:22:03 -0500 Subject: [PATCH 3/3] cleanup --- .github/workflows/ci-linux.yml | 26 ++++++- experimental/CMakeLists.txt | 4 + experimental/hsa/aie_hsa_bare_add_one.cc | 98 ++++++++---------------- experimental/hsa/hsa_ipu.h | 53 ++++++------- 4 files changed, 84 insertions(+), 97 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 5071219ab..835151081 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -201,11 +201,13 @@ jobs: print_ir_aie2xclbin_results \ $PWD/llvm-aie - test_hsa: + test_hsa_full: name: HSA MWE strategy: fail-fast: false - runs-on: linux-phoenix-20240606 + matrix: + runs-on: [linux-phoenix-20240606] + runs-on: ${{ matrix.runs-on }} steps: - name: "Checking out repository" # for test scripts uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 @@ -219,6 +221,24 @@ jobs: cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja ninja aie_hsa_dispatch_test ./aie_hsa_dispatch_test $PWD/.. - + + test_hsa_bare: + name: HSA MWE + strategy: + fail-fast: false + matrix: + runs-on: [linux-phoenix-20240819] + runs-on: ${{ matrix.runs-on }} + steps: + - name: "Checking out repository" # for test scripts + uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 + with: + submodules: false # not required for testbench + + - name: Build and run HSA example + run: | + cd experimental/hsa + mkdir build && pushd build + cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja ninja aie_hsa_bare_add_one ./aie_hsa_bare_add_one $PWD/.. diff --git a/experimental/CMakeLists.txt b/experimental/CMakeLists.txt index dee0aab59..0f53224b1 100644 --- a/experimental/CMakeLists.txt +++ b/experimental/CMakeLists.txt @@ -5,3 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception add_subdirectory(delegate) +find_package(hsa-runtime64 QUIET) +if(${hsa-runtime64_FOUND}) + add_subdirectory(hsa) +endif() \ No newline at end of file diff --git a/experimental/hsa/aie_hsa_bare_add_one.cc b/experimental/hsa/aie_hsa_bare_add_one.cc index f5843d7e9..ad41567b0 100644 --- a/experimental/hsa/aie_hsa_bare_add_one.cc +++ b/experimental/hsa/aie_hsa_bare_add_one.cc @@ -1,39 +1,12 @@ - -/* - -RUN: (add_one_test %S) | FileCheck %s -CHECK: /dev/accel/accel0 open -CHECK: Driver version 1.1 -CHECK: Heap buffer @: 0x7f313c000000 -CHECK: Loading pdi -CHECK: Pdi file size: 3552 -CHECK: Loading dpu inst -CHECK: Loading dpu inst -CHECK: DPU 0 instructions @: 0x7f313c008000 -CHECK: DPU 1 instructions @: 0x7f313c010000 -CHECK: PDI file @: 0x7f313c000000 -CHECK: PDI handle @: 2 -CHECK: Input @: 0x7f313c018000 -CHECK: Output @: 0x7f313c020000 -CHECK: Input @: 0x7f313c028000 -CHECK: Output @: 0x7f313c030000 -CHECK: Size of param_config_cu: 0x8 -CHECK: Synch bo ioctl failed for handle 11 -CHECK: Synch bo ioctl failed for handle 9 -CHECK: Synch bo ioctl failed for handle 10 -CHECK: Checking run 0: -CHECK: Checking run 1: -CHECK: PASS! -CHECK: Closing -CHECK: Done - - */ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include #include -#include #include -#include #include "amdxdna_accel.h" #include "hsa_ipu.h" @@ -174,8 +147,6 @@ int main(int argc, char **argv) { uint32_t input_0_handle; ret = create_dev_bo(drv_fd, &input_0, &input_0_sram_vaddr, &input_0_handle, DATA_BUFFER_SIZE); - // ret = create_shmem_bo(drv_fd, &input_0, &input_0_sram_vaddr, - // &input_0_handle, DATA_BUFFER_SIZE); printf("Input @: %p\n", (void *)input_0); if (ret < 0) { printf("Error %i creating data 0\n", ret); @@ -190,8 +161,6 @@ int main(int argc, char **argv) { uint32_t output_0_handle; ret = create_dev_bo(drv_fd, &output_0, &output_0_sram_vaddr, &output_0_handle, DATA_BUFFER_SIZE); - // ret = create_shmem_bo(drv_fd, &output_0, &output_0_sram_vaddr, - // &output_0_handle, DATA_BUFFER_SIZE); printf("Output @: %p\n", (void *)output_0); if (ret < 0) { printf("Error %i creating data 1\n", ret); @@ -206,8 +175,6 @@ int main(int argc, char **argv) { uint32_t input_1_handle; ret = create_dev_bo(drv_fd, &input_1, &input_1_sram_vaddr, &input_1_handle, DATA_BUFFER_SIZE); - // ret = create_shmem_bo(drv_fd, &input_1, &input_1_sram_vaddr, - // &input_1_handle, DATA_BUFFER_SIZE); printf("Input @: %p\n", (void *)input_1); if (ret < 0) { printf("Error %i creating data 0\n", ret); @@ -222,8 +189,6 @@ int main(int argc, char **argv) { uint32_t output_1_handle; ret = create_dev_bo(drv_fd, &output_1, &output_1_sram_vaddr, &output_1_handle, DATA_BUFFER_SIZE); - // ret = create_shmem_bo(drv_fd, &output_1, &output_1_sram_vaddr, - // &output_1_handle, DATA_BUFFER_SIZE); printf("Output @: %p\n", (void *)output_1); if (ret < 0) { printf("Error %i creating data 1\n", ret); @@ -261,7 +226,7 @@ int main(int argc, char **argv) { // 2. Allocate the queue buffer as a user-mode queue // Allocating a structure to store QOS information - struct amdxdna_qos_info *qos = + amdxdna_qos_info *qos = (struct amdxdna_qos_info *)malloc(sizeof(struct amdxdna_qos_info)); qos->gops = 0; qos->fps = 0; @@ -271,7 +236,7 @@ int main(int argc, char **argv) { qos->priority = 0; // This is the structure that we pass - struct amdxdna_drm_create_hwctx create_hw_ctx = { + amdxdna_drm_create_hwctx create_hw_ctx = { .ext = 0, .ext_flags = 0, .qos_p = (uint64_t)qos, @@ -289,25 +254,26 @@ int main(int argc, char **argv) { } // Creating a structure to configure the CU - struct amdxdna_cu_config cu_config = { + amdxdna_cu_config cu_config = { .cu_bo = pdi_handle, .cu_func = 0, }; // Creating a structure to configure the hardware context - struct amdxdna_hwctx_param_config_cu param_config_cu; + amdxdna_hwctx_param_config_cu param_config_cu; param_config_cu.num_cus = 1; param_config_cu.cu_configs[0] = cu_config; printf("Size of param_config_cu: 0x%lx\n", sizeof(param_config_cu)); // Configuring the hardware context with the PDI - struct amdxdna_drm_config_hwctx config_hw_ctx = { + amdxdna_drm_config_hwctx config_hw_ctx = { .handle = create_hw_ctx.handle, .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU, - .param_val = - (uint64_t)¶m_config_cu, // Pass in the pointer to the param value - .param_val_size = 0x10, // Size of param config CU is 16B + // Pass in the pointer to the param value + .param_val = (uint64_t)¶m_config_cu, + // Size of param config CU is 16B + .param_val_size = 0x10, }; ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx); if (ret != 0) { @@ -317,7 +283,7 @@ int main(int argc, char **argv) { ///////////////////////////////////////////////////////////////////////////////// // Step 2: Configuring the CMD BOs with the different instruction sequences - struct amdxdna_drm_create_bo create_cmd_bo_0 = { + amdxdna_drm_create_bo create_cmd_bo_0 = { .type = AMDXDNA_BO_CMD, .size = PACKET_SIZE, }; @@ -327,8 +293,8 @@ int main(int argc, char **argv) { return -1; } - struct amdxdna_drm_get_bo_info cmd_bo_0_get_bo_info = { - .handle = create_cmd_bo_0.handle}; + amdxdna_drm_get_bo_info cmd_bo_0_get_bo_info = {.handle = + create_cmd_bo_0.handle}; ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_0_get_bo_info); if (ret != 0) { perror("Failed to get cmd BO 0 info"); @@ -336,7 +302,7 @@ int main(int argc, char **argv) { } // Writing the first packet to the queue - struct amdxdna_cmd *cmd_0 = (struct amdxdna_cmd *)mmap( + amdxdna_cmd *cmd_0 = (struct amdxdna_cmd *)mmap( 0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, cmd_bo_0_get_bo_info.map_offset); cmd_0->state = 1; // ERT_CMD_STATE_NEW; @@ -355,7 +321,7 @@ int main(int argc, char **argv) { cmd_0->data[9] = (output_0 >> 32) & 0xFFFFFFFF; // Output high // Writing to the second packet of the queue - struct amdxdna_drm_create_bo create_cmd_bo_1 = { + amdxdna_drm_create_bo create_cmd_bo_1 = { .type = AMDXDNA_BO_CMD, .size = PACKET_SIZE, }; @@ -365,15 +331,15 @@ int main(int argc, char **argv) { return -1; } - struct amdxdna_drm_get_bo_info cmd_bo_1_get_bo_info = { - .handle = create_cmd_bo_1.handle}; + amdxdna_drm_get_bo_info cmd_bo_1_get_bo_info = {.handle = + create_cmd_bo_1.handle}; ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_1_get_bo_info); if (ret != 0) { perror("Failed to get cmd BO 0 info"); return -2; } - struct amdxdna_cmd *cmd_1 = (struct amdxdna_cmd *)mmap( + amdxdna_cmd *cmd_1 = (struct amdxdna_cmd *)mmap( 0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, cmd_bo_1_get_bo_info.map_offset); cmd_1->state = 1; // ERT_CMD_STATE_NEW; @@ -396,13 +362,13 @@ int main(int argc, char **argv) { // the command chain that points to the instruction sequences just created // Allocate a command chain - void *bo_cmd_chain_buf = NULL; + void *bo_cmd_chain_buf = nullptr; cmd_bo_ret = posix_memalign(&bo_cmd_chain_buf, 4096, 4096); - if (cmd_bo_ret != 0 || bo_cmd_chain_buf == NULL) { + if (cmd_bo_ret != 0 || bo_cmd_chain_buf == nullptr) { printf("[ERROR] Failed to allocate cmd_bo buffer of size %d\n", 4096); } - struct amdxdna_drm_create_bo create_cmd_chain_bo = { + amdxdna_drm_create_bo create_cmd_chain_bo = { .type = AMDXDNA_BO_CMD, .size = 4096, }; @@ -412,7 +378,7 @@ int main(int argc, char **argv) { return -1; } - struct amdxdna_drm_get_bo_info cmd_chain_bo_get_bo_info = { + amdxdna_drm_get_bo_info cmd_chain_bo_get_bo_info = { .handle = create_cmd_chain_bo.handle}; ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_chain_bo_get_bo_info); if (ret != 0) { @@ -420,12 +386,12 @@ int main(int argc, char **argv) { return -2; } - struct amdxdna_cmd *cmd_chain = + amdxdna_cmd *cmd_chain = (struct amdxdna_cmd *)mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, cmd_chain_bo_get_bo_info.map_offset); // Writing information to the command buffer - struct amdxdna_cmd_chain *cmd_chain_payload = + amdxdna_cmd_chain *cmd_chain_payload = (struct amdxdna_cmd_chain *)(cmd_chain->data); cmd_chain->state = 1; // ERT_CMD_STATE_NEW; cmd_chain->extra_cu_masks = 0; @@ -445,7 +411,7 @@ int main(int argc, char **argv) { // Perform a submit cmd uint32_t bo_args[6] = {dpu_0_handle, dpu_1_handle, input_0_handle, output_0_handle, input_1_handle, output_1_handle}; - struct amdxdna_drm_exec_cmd exec_cmd_0 = { + amdxdna_drm_exec_cmd exec_cmd_0 = { .ext = 0, .ext_flags = 0, .hwctx = create_hw_ctx.handle, @@ -464,7 +430,7 @@ int main(int argc, char **argv) { ///////////////////////////////////////////////////////////////////////////////// // Step 4: Wait for the output // Use the wait IOCTL to wait for our submission to complete - struct amdxdna_drm_wait_cmd wait_cmd = { + amdxdna_drm_wait_cmd wait_cmd = { .hwctx = create_hw_ctx.handle, .timeout = 50, // 50ms timeout .seq = exec_cmd_0.seq, @@ -490,8 +456,6 @@ int main(int argc, char **argv) { for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { uint32_t src = *((uint32_t *)input_0 + i); uint32_t dst = *((uint32_t *)output_0 + i); - // printf("src: 0x%x\n", src); - // printf("dst: 0x%x\n", dst); if (src + 1 != dst) { printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst); errors++; @@ -502,8 +466,6 @@ int main(int argc, char **argv) { for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { uint32_t src = *((uint32_t *)input_1 + i); uint32_t dst = *((uint32_t *)output_1 + i); - // printf("src: 0x%x\n", src); - // printf("dst: 0x%x\n", dst); if (src + 1 != dst) { printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst); errors++; diff --git a/experimental/hsa/hsa_ipu.h b/experimental/hsa/hsa_ipu.h index 31176a34f..4ba2505e6 100644 --- a/experimental/hsa/hsa_ipu.h +++ b/experimental/hsa/hsa_ipu.h @@ -1,3 +1,9 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#pragma once #include #include @@ -15,8 +21,6 @@ #include "amdxdna_accel.h" // want to mmap the file -#include -#include #define MAX_NUM_INSTRUCTIONS 1024 // Maximum number of dpu or pdi instructions. @@ -43,9 +47,9 @@ void ring_doorbell(uint64_t doorbell) { int get_driver_version(int fd, __u32 *major, __u32 *minor) { int ret; - struct amdxdna_drm_query_aie_version version; + amdxdna_drm_query_aie_version version; - struct amdxdna_drm_get_info info_params = { + amdxdna_drm_get_info info_params = { .param = DRM_AMDXDNA_QUERY_AIE_VERSION, .buffer_size = sizeof(version), .buffer = (__u64)&version, @@ -63,7 +67,7 @@ int get_driver_version(int fd, __u32 *major, __u32 *minor) { /* Allocates a heap on the device by creating a BO of type dev heap */ -static int alloc_heap(int fd, __u32 size, __u32 *handle) { +int alloc_heap(int fd, __u32 size, __u32 *handle) { int ret; void *heap_buf = NULL; const size_t alignment = 64 * 1024 * 1024; @@ -80,7 +84,7 @@ static int alloc_heap(int fd, __u32 size, __u32 *handle) { return -1; } - struct amdxdna_drm_create_bo create_bo_params = { + amdxdna_drm_create_bo create_bo_params = { .type = AMDXDNA_BO_DEV_HEAP, .size = size, }; @@ -90,8 +94,7 @@ static int alloc_heap(int fd, __u32 size, __u32 *handle) { *handle = create_bo_params.handle; } - struct amdxdna_drm_get_bo_info get_bo_info = {.handle = - create_bo_params.handle}; + amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo_params.handle}; ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); if (ret != 0) { perror("Failed to get BO info"); @@ -111,9 +114,9 @@ static int alloc_heap(int fd, __u32 size, __u32 *handle) { /* Creates a dev bo which is carved out of the heap bo. */ -static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, - __u32 *handle, __u64 size_in_bytes) { - struct amdxdna_drm_create_bo create_bo = { +int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, __u32 *handle, + __u64 size_in_bytes) { + amdxdna_drm_create_bo create_bo = { .type = AMDXDNA_BO_DEV, .size = size_in_bytes, }; @@ -123,7 +126,7 @@ static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, return -1; } - struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; + amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); if (ret != 0) { perror("Failed to get BO info"); @@ -139,8 +142,8 @@ static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, /* Creates a shmem bo */ -static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, - __u32 *handle, __u64 size_in_bytes) { +int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, + __u32 *handle, __u64 size_in_bytes) { const size_t alignment = 64 * 1024 * 1024; void *shmem_create = NULL; int ret = posix_memalign(&shmem_create, alignment, size_in_bytes); @@ -153,16 +156,16 @@ static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, printf("Shmem BO @: %p\n", shmem_create); - struct amdxdna_drm_create_bo create_bo = {.type = AMDXDNA_BO_SHMEM, - .vaddr = (__u64)shmem_create, - .size = size_in_bytes}; + amdxdna_drm_create_bo create_bo = {.type = AMDXDNA_BO_SHMEM, + .vaddr = (__u64)shmem_create, + .size = size_in_bytes}; ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo); if (ret != 0) { perror("Failed to create BO"); return -1; } - struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; + amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); if (ret != 0) { perror("Failed to get BO info"); @@ -178,10 +181,8 @@ static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, /* Wrapper around synch bo ioctl. */ -static int sync_bo(int fd, __u32 handle) { - struct amdxdna_drm_sync_bo sync_params = { - .handle = handle, - }; +int sync_bo(int fd, __u32 handle) { + amdxdna_drm_sync_bo sync_params = {.handle = handle}; int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params); if (ret != 0) { printf("Synch bo ioctl failed for handle %d\n", handle); @@ -193,8 +194,8 @@ static int sync_bo(int fd, __u32 handle) { Create a BO_DEV and populate it with a PDI */ -static int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle, - const char *path) { +int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle, + const char *path) { FILE *file = fopen(path, "r"); if (file == NULL) { perror("Failed to open instructions file."); @@ -233,8 +234,8 @@ static int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle, Create a BO DEV and populate it with instructions whose virtual address is passed to the driver via an HSA packet. */ -static int load_instructions(int fd, uint64_t *vaddr, uint64_t *sram_addr, - __u32 *handle, const char *path, __u32 *num_inst) { +int load_instructions(int fd, uint64_t *vaddr, uint64_t *sram_addr, + __u32 *handle, const char *path, __u32 *num_inst) { // read dpu instructions into an array FILE *file = fopen(path, "r"); if (file == NULL) {