From 06725beac47a03226554940c044b7c5964821236 Mon Sep 17 00:00:00 2001
From: makslevental <maksim.levental@gmail.com>
Date: Tue, 27 Aug 2024 17:14:14 -0500
Subject: [PATCH 1/3] [wip] hsa demo

---
 .github/workflows/ci-linux.yml            |  19 ++
 experimental/CMakeLists.txt               |   2 +-
 experimental/hsa/CMakeLists.txt           |   9 +
 experimental/hsa/add_one.pdi              | Bin 0 -> 3552 bytes
 experimental/hsa/add_one_insts.txt        |  68 +++++
 experimental/hsa/aie_hsa_dispatch_test.cc | 309 ++++++++++++++++++++++
 6 files changed, 406 insertions(+), 1 deletion(-)
 create mode 100644 experimental/hsa/CMakeLists.txt
 create mode 100644 experimental/hsa/add_one.pdi
 create mode 100644 experimental/hsa/add_one_insts.txt
 create mode 100644 experimental/hsa/aie_hsa_dispatch_test.cc

diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index 100ea0eca..383b99d8a 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -200,3 +200,22 @@ jobs:
             iree-install \
             print_ir_aie2xclbin_results \
             $PWD/llvm-aie
+
+  test_hsa:
+    name: HSA MWE
+    strategy:
+      fail-fast: false
+    runs-on: linux-phoenix-20240606
+    steps:
+      - name: "Checking out repository" # for test scripts
+        uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
+        with:
+          submodules: false # not required for testbench
+
+      - name: Build and run HSA example
+        run: |
+          cd experimental/hsa
+          mkdir build && pushd build
+          cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja
+          ninja aie_hsa_dispatch_test
+          ./aie_hsa_dispatch_test $PWD/..
diff --git a/experimental/CMakeLists.txt b/experimental/CMakeLists.txt
index ae2678c84..dee0aab59 100644
--- a/experimental/CMakeLists.txt
+++ b/experimental/CMakeLists.txt
@@ -4,4 +4,4 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-iree_add_all_subdirs()
+add_subdirectory(delegate)
diff --git a/experimental/hsa/CMakeLists.txt b/experimental/hsa/CMakeLists.txt
new file mode 100644
index 000000000..8c299a30a
--- /dev/null
+++ b/experimental/hsa/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+find_package(hsa-runtime64 REQUIRED)
+add_executable(aie_hsa_dispatch_test aie_hsa_dispatch_test.cc)
+target_link_libraries(aie_hsa_dispatch_test PUBLIC hsa-runtime64::hsa-runtime64)
diff --git a/experimental/hsa/add_one.pdi b/experimental/hsa/add_one.pdi
new file mode 100644
index 0000000000000000000000000000000000000000..f21475d21f3ef286d180bea6e2a3127c53a48239
GIT binary patch
literal 3552
zcmcInO>7%Q6n^V<vZg8ZI&y+Hb?ByvM2L`TAOfc;?KY08AcXLtk+3*4MJi1pNXf;d
z7h<4P0YVfB72!lGIp=@^2axImZs8CRLXkLG;wLIqxs=Me1l0U2-<#caW*vi~0AtPW
z_ul*Fy_q+&Z`VH&QUBwIBa^SaH27U%Ax{(}1-cG&zIUT@`ylsi)IrYm>Hc)u)J;mr
zN*D9Ey|nSJ>$4nB9)I@2xic4DJ$wG#w!`=~^rN}Xp4JYP=rB6p<z~C>G?ev30Xc@u
z`#1R5n!Oo>gH-=>h(OeeIm~^&Xh#q%qCnxcI;BCNT+pPUI!{~s9)j9G)}~J?lRr!;
zoswTc9wY)4*Zo9!?a}5die*tzK`x!&;GYv4ww=i)4jmaM`YiaO8}8Zc)Q{y~JQm-L
zDCygBKJ;l6zD(IyCVM=ln$PeNp(ce!dB?mT<d91F`b5mp6jhp(x5DG`-3ZB%Aj97k
zqG8#eVX4X92+d-R%U{|+u^)iGx1Ul>8o{;8dfzr~Z-tFp8)4(-df2#Yhm8^R;mvUV
z#=S7+GZUcHlKoI8w%v*TWF>!M{;qwXroEt3Y(=5x6}6z3lbGP-D18AWb`u!u>+Jm)
zdSM}!z4gMnCwcrW5*hQxJMyCX4*OF_*_t(Xd&m0c1C>#|->7B?>&%)W|M2J7Q%kBj
zTyy>ZMgGce{$_d*FqB39W_m>afJOeULN6@7e8IXWd0ai^^XJHy)ce?<ItpuU6#3iT
z-OHcDcJqhvyUkx=K&2O`EmXAgZ@$trxOHg5p2$y*HRVH(Me}cu@pC%kd$xwwwaiWQ
zTN_&D_Lf%4qHk?#&OSKjSD}J?o{2IqgEw5u&gYp(KELUsP#~)XJ$)4U7l>wy3QhiQ
zx1%Zqo&E66?{4#7o$u=L|7-qBH|}YfyS9e9fphNVr+4W*b?Q{KHKVanH)7j$MtVLu
zJ(Uq+KO1!)m+MjH!-kksRk`X7QVyj%)!A_M#MBYO-jxGI!&H;T9Xp)wLlr7+g}J^a
z$)emTTO%r6RyA){A_l5!COh5g>GH#?P4KfEz&1EPZ@icumHM`@KJ+`bR(jobeaS1D
z0C9}TD;y7d+X7buq!e_k4`?Y(y${cIEIe@BN>k>E`gmsTQhacJ1n&uscX91{EN_0T
zr#j#J=#PX(rD;_wU0#>wY@GJ8$NAvrwIL^-UXDq|)Nj#D^FFuQE0k%z*ut|zj;55h
zb`b;Tb=rFe@VhXeKr-9;73p+IW}hm*E6Z&R1=b~ON;n~5OTuXh4@r1b!Z``gNO)et
zixMtMcv-?L5-v;F1m^YeS`xC}a$Y_{FJY77+k9Ye?IiipB|jwbb(t^u)Lk4w$cW;^
ztyB1m0&TgCOw<5@rOV2R-~s>lhC?wOK2D9lfnQ&XZwbETCqw%6Oz^>4{2|9be?Ivd
z_|2TF;LmMBFV2pOc|~lCLKbHU=SY`q5fJK>eXhS2vVd~b;>!M;WuJUQ=9}O#zewKr
zf96{*KSx|-ceazZ<?}yO@!z$R3{5qkRA)X!v%KsbEoJxl{F^S{qIY-2w<`Xx)#Uft
zDWCtLivNYb<%@l*ck(Uv&BZ)`ZQYQ)eM?l?jzNxr$K#S!w*8RVE89X<woqAAnjHJ5
z)oq-B9-rDmAC>ev^aoshQqmJ0B$QLZk1Gc`-vU1Zy@yHC$Dzl%1ScrzgV5ja>bqrq
txm}u1d&OrI`ih@bM=x@Rk0}1OCp;l62O;x+FCCrFt>BnqERV0WzW_#=ycPfe

literal 0
HcmV?d00001

diff --git a/experimental/hsa/add_one_insts.txt b/experimental/hsa/add_one_insts.txt
new file mode 100644
index 000000000..a5e9f9d33
--- /dev/null
+++ b/experimental/hsa/add_one_insts.txt
@@ -0,0 +1,68 @@
+06030100
+00000105
+00000007
+00000110
+00000001
+00000000
+0001D000
+00000030
+00000400
+00000000
+00000000
+00000000
+80000000
+00000000
+00000000
+02000000
+00000081
+00000030
+00000000
+00000000
+00000000
+00000000
+0001D004
+00000000
+00000001
+00000000
+00000000
+00000000
+00000000
+00000000
+0001D204
+00000000
+80000000
+00000018
+00000001
+00000000
+0001D020
+00000030
+00000400
+00000000
+00000000
+00000000
+80000000
+00000000
+00000000
+02000000
+00000081
+00000030
+00000000
+00000000
+00000000
+00000000
+0001D024
+00000000
+00000000
+00000000
+00000000
+00000000
+00000000
+00000000
+0001D214
+00000000
+00000001
+00000018
+00000080
+00000010
+00000000
+00010100
diff --git a/experimental/hsa/aie_hsa_dispatch_test.cc b/experimental/hsa/aie_hsa_dispatch_test.cc
new file mode 100644
index 000000000..799a060df
--- /dev/null
+++ b/experimental/hsa/aie_hsa_dispatch_test.cc
@@ -0,0 +1,309 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <sys/mman.h>
+
+#include <cassert>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+
+namespace {
+
+// Appends `agent` to `agents` when its device type is `requested_dev_type`.
+// Rejects a null vector or a device type other than AIE/GPU/CPU.
+hsa_status_t get_agent(hsa_agent_t agent, std::vector<hsa_agent_t> *agents,
+                       hsa_device_type_t requested_dev_type) {
+  const bool known_type = requested_dev_type == HSA_DEVICE_TYPE_AIE ||
+                          requested_dev_type == HSA_DEVICE_TYPE_GPU ||
+                          requested_dev_type == HSA_DEVICE_TYPE_CPU;
+  if (agents == nullptr || !known_type) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  hsa_device_type_t actual_type;
+  const hsa_status_t status =
+      hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &actual_type);
+
+  // A failed query is passed through without touching `agents`.
+  if (status == HSA_STATUS_SUCCESS && actual_type == requested_dev_type) {
+    agents->push_back(agent);
+  }
+
+  return status;
+}
+
+// hsa_iterate_agents callback; `data` is a std::vector<hsa_agent_t> * sink.
+hsa_status_t get_aie_agents(hsa_agent_t agent, void *data) {
+  if (data == nullptr) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  return get_agent(agent, static_cast<std::vector<hsa_agent_t> *>(data),
+                   HSA_DEVICE_TYPE_AIE);
+}
+
+// Memory-pool iteration helper. Stores `pool` through `data` (which must
+// point to an hsa_amd_memory_pool_t) when the pool is a coarse-grained
+// global pool whose kernarg flag matches `kernarg`. Only failed queries are
+// reported as errors; a pool that does not match still returns
+// HSA_STATUS_SUCCESS.
+hsa_status_t get_coarse_global_mem_pool(hsa_amd_memory_pool_t pool, void *data,
+                                        bool kernarg) {
+  hsa_amd_segment_t segment;
+  hsa_status_t status = hsa_amd_memory_pool_get_info(
+      pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+  if (status != HSA_STATUS_SUCCESS) {
+    return status;
+  }
+
+  if (segment != HSA_AMD_SEGMENT_GLOBAL) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  hsa_amd_memory_pool_global_flag_t flags;
+  status = hsa_amd_memory_pool_get_info(
+      pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
+  if (status != HSA_STATUS_SUCCESS) {
+    return status;
+  }
+
+  const bool is_coarse =
+      (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) != 0;
+  const bool has_kernarg = (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) != 0;
+  if (is_coarse && has_kernarg == kernarg) {
+    *static_cast<hsa_amd_memory_pool_t *>(data) = pool;
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+hsa_status_t get_coarse_global_dev_mem_pool(hsa_amd_memory_pool_t pool,
+                                            void *data) {
+  return get_coarse_global_mem_pool(pool, data, false);  // non-kernarg pool
+}
+
+hsa_status_t get_coarse_global_kernarg_mem_pool(hsa_amd_memory_pool_t pool,
+                                                void *data) {
+  return get_coarse_global_mem_pool(pool, data, true);  // kernarg pool
+}
+
+// Reads all of `file_name` into memory allocated from `mem_pool`; the
+// allocation is returned through `*buf`.
+void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
+                   void **buf) {
+  // Opening at the end lets tellg() report the file size.
+  std::ifstream pdi_stream(file_name,
+                           std::ios::in | std::ios::binary | std::ios::ate);
+  assert(!pdi_stream.fail());
+  auto num_bytes = pdi_stream.tellg();
+  pdi_stream.seekg(0, std::ios::beg);
+  auto status = hsa_amd_memory_pool_allocate(mem_pool, num_bytes, 0, buf);
+  assert(status == HSA_STATUS_SUCCESS);
+  pdi_stream.read(static_cast<char *>(*buf), num_bytes);
+}
+
+// Parses the whitespace-separated hex words in `file_name` and copies them
+// into memory allocated from `mem_pool`, returned through `*buf`.
+void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
+                   void **buf) {
+  std::ifstream inst_file(file_name, std::ios::in | std::ios::binary);
+  assert(inst_file.fail() == false);
+
+  std::vector<uint32_t> inst_words;
+  std::string token;
+  while (inst_file >> token) {
+    inst_words.push_back(std::stoul(token, nullptr, 16));
+  }
+  // Size the buffer from the parsed words rather than the text file's byte
+  // length, so a file of short tokens cannot make the copy overrun it.
+  const std::size_t num_bytes = inst_words.size() * sizeof(uint32_t);
+  auto r = hsa_amd_memory_pool_allocate(mem_pool, num_bytes, 0, buf);
+  assert(r == HSA_STATUS_SUCCESS);
+  std::memcpy(*buf, inst_words.data(), num_bytes);
+}
+
+}  // namespace
+
+// Dispatches the add_one AIE kernel through HSA and checks its output.
+int main(int argc, char **argv) {
+  if (argc < 2) {
+    std::cerr << "usage: aie_hsa_dispatch_test <dir with add_one files>\n";
+    return 1;
+  }
+  std::filesystem::path sourcePath(argv[1]);
+  // List of AIE agents in the system.
+  std::vector<hsa_agent_t> aie_agents;
+  // For creating a queue on an AIE agent.
+  hsa_queue_t *aie_queue(nullptr);
+  // Device-mapped memory pool. Used for PDI/DPU instructions.
+  hsa_amd_memory_pool_t global_dev_mem_pool{0};
+  // System memory pool. Used for allocating kernel argument data.
+  hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
+  const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
+  const std::string pdi_file_name(sourcePath / "add_one.pdi");
+  uint32_t *dpu_inst_buf(nullptr);
+  uint64_t *pdi_buf(nullptr);
+
+  // Initialize the runtime.
+  auto r = hsa_init();
+  assert(r == HSA_STATUS_SUCCESS);
+
+  assert(sizeof(hsa_kernel_dispatch_packet_s) ==
+         sizeof(hsa_amd_aie_ert_packet_s));
+
+  // Test a launch of an AIE kernel using the HSA API.
+  // Find the AIE agents in the system.
+  r = hsa_iterate_agents(get_aie_agents, &aie_agents);
+  assert(r == HSA_STATUS_SUCCESS);
+  //    assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) ==
+  //    HSA_STATUS_SUCCESS);
+  assert(aie_agents.size() == 1);
+
+  const auto &aie_agent = aie_agents.front();
+
+  // Create a queue on the first agent.
+  r = hsa_queue_create(aie_agent, 64, HSA_QUEUE_TYPE_SINGLE, nullptr, nullptr,
+                       0, 0, &aie_queue);
+  assert(r == HSA_STATUS_SUCCESS);
+  assert(aie_queue);
+  assert(aie_queue->base_address);
+
+  // Find a pool for DEV BOs. This is a global system memory pool that is
+  // mapped to the device. Will be used for PDIs and DPU instructions.
+  r = hsa_amd_agent_iterate_memory_pools(
+      aie_agent, get_coarse_global_dev_mem_pool, &global_dev_mem_pool);
+  assert(r == HSA_STATUS_SUCCESS);
+  assert(global_dev_mem_pool.handle);
+
+  // Find a pool that supports kernel args. This is just normal system memory.
+  // It will be used for commands and input data.
+  r = hsa_amd_agent_iterate_memory_pools(
+      aie_agent, get_coarse_global_kernarg_mem_pool, &global_kernarg_mem_pool);
+  assert(r == HSA_STATUS_SUCCESS);
+  assert(global_kernarg_mem_pool.handle);
+
+  // Load the DPU and PDI files into a global pool that doesn't support kernel
+  // args (DEV BO).
+  load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
+                reinterpret_cast<void **>(&dpu_inst_buf));
+  uint32_t dpu_handle = 0;
+  r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
+  assert(r == HSA_STATUS_SUCCESS);
+  assert(dpu_handle != 0);
+
+  load_pdi_file(global_dev_mem_pool, pdi_file_name,
+                reinterpret_cast<void **>(&pdi_buf));
+  uint32_t pdi_handle = 0;
+  r = hsa_amd_get_handle_from_vaddr(pdi_buf, &pdi_handle);
+  assert(r == HSA_STATUS_SUCCESS);
+  assert(pdi_handle != 0);
+
+  hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle,
+                                               .cu_func = 0};
+
+  hsa_amd_aie_ert_hw_ctx_config_cu_param_t config_cu_args{
+      .num_cus = 1, .cu_configs = &cu_config};
+
+  // Configure the queue's hardware context.
+  r = hsa_amd_queue_hw_ctx_config(
+      aie_queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &config_cu_args);
+  assert(r == HSA_STATUS_SUCCESS);
+
+  // create inputs / outputs
+  constexpr std::size_t num_data_elements = 1024;
+  constexpr std::size_t data_buffer_size =
+      num_data_elements * sizeof(std::uint32_t);
+
+  std::uint32_t *input = {};
+  r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
+                                   reinterpret_cast<void **>(&input));
+  assert(r == HSA_STATUS_SUCCESS);
+  std::uint32_t input_handle = {};
+  r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
+  assert(r == HSA_STATUS_SUCCESS);
+  assert(input_handle != 0);
+
+  std::uint32_t *output = {};
+  r = hsa_amd_memory_pool_allocate(global_dev_mem_pool, data_buffer_size, 0,
+                                   reinterpret_cast<void **>(&output));
+  assert(r == HSA_STATUS_SUCCESS);
+  std::uint32_t output_handle = {};
+  r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
+  assert(r == HSA_STATUS_SUCCESS);
+  assert(output_handle != 0);
+
+  for (std::size_t i = 0; i < num_data_elements; i++) {
+    *(input + i) = i;
+    *(output + i) = 0xDEFACE;
+  }
+
+  // Create the packet that stores the command.
+  hsa_amd_aie_ert_packet_t *cmd_pkt = NULL;
+  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
+                                   reinterpret_cast<void **>(&cmd_pkt));
+  assert(r == HSA_STATUS_SUCCESS);
+  cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
+  cmd_pkt->count = 0xA;  // # of arguments to put in command
+  cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
+  cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
+  cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
+                           << HSA_PACKET_HEADER_TYPE;
+
+  // Creating the payload for the packet
+  hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
+  uint32_t cmd_handle;
+  r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
+                                    &cmd_handle);
+  assert(r == HSA_STATUS_SUCCESS);
+  r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
+                                   reinterpret_cast<void **>(&cmd_payload));
+  assert(r == HSA_STATUS_SUCCESS);
+  cmd_payload->cu_mask = 0x1;  // Selecting the PDI to use with this command
+  cmd_payload->data[0] = 0x3;  // Transaction opcode
+  cmd_payload->data[1] = 0x0;
+  cmd_payload->data[2] = dpu_handle;
+  cmd_payload->data[3] = 0x0;
+  cmd_payload->data[4] = 0x44;  // Size of DPU instruction
+  cmd_payload->data[5] = input_handle;
+  cmd_payload->data[6] = 0;
+  cmd_payload->data[7] = output_handle;
+  cmd_payload->data[8] = 0;
+  cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);
+
+  uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
+  uint64_t packet_id = wr_idx % aie_queue->size;
+  reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
+      aie_queue->base_address)[packet_id] = *cmd_pkt;
+  hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);
+
+  for (std::size_t i = 0; i < num_data_elements; i++) {
+    const auto expected = *(input + i) + 1;
+    const auto result = *(output + i);
+    assert(result == expected);
+  }
+
+  r = hsa_queue_destroy(aie_queue);
+  assert(r == HSA_STATUS_SUCCESS);
+
+  r = hsa_amd_memory_pool_free(output);
+  assert(r == HSA_STATUS_SUCCESS);
+  r = hsa_amd_memory_pool_free(input);
+  assert(r == HSA_STATUS_SUCCESS);
+  r = hsa_amd_memory_pool_free(pdi_buf);
+  assert(r == HSA_STATUS_SUCCESS);
+  r = hsa_amd_memory_pool_free(dpu_inst_buf);
+  assert(r == HSA_STATUS_SUCCESS);
+
+  r = hsa_shut_down();
+  assert(r == HSA_STATUS_SUCCESS);
+  std::cout << "PASS";
+}

From d174fd44fe06629dd5a7fce91aa13c9077888162 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Tue, 27 Aug 2024 22:09:27 -0500
Subject: [PATCH 2/3] add bare example

---
 .github/workflows/ci-linux.yml            |   3 +
 experimental/hsa/CMakeLists.txt           |   2 +
 experimental/hsa/aie_hsa_bare_add_one.cc  | 523 ++++++++++++++++++++
 experimental/hsa/aie_hsa_dispatch_test.cc |   2 +-
 experimental/hsa/amdxdna_accel.h          | 569 ++++++++++++++++++++++
 experimental/hsa/hsa_ipu.h                | 270 ++++++++++
 6 files changed, 1368 insertions(+), 1 deletion(-)
 create mode 100644 experimental/hsa/aie_hsa_bare_add_one.cc
 create mode 100644 experimental/hsa/amdxdna_accel.h
 create mode 100644 experimental/hsa/hsa_ipu.h

diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index 383b99d8a..5071219ab 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -219,3 +219,6 @@ jobs:
           cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja
           ninja aie_hsa_dispatch_test
           ./aie_hsa_dispatch_test $PWD/..
+          
+          ninja aie_hsa_bare_add_one
+          ./aie_hsa_bare_add_one $PWD/..
diff --git a/experimental/hsa/CMakeLists.txt b/experimental/hsa/CMakeLists.txt
index 8c299a30a..b1655008e 100644
--- a/experimental/hsa/CMakeLists.txt
+++ b/experimental/hsa/CMakeLists.txt
@@ -7,3 +7,5 @@
 find_package(hsa-runtime64 REQUIRED)
 add_executable(aie_hsa_dispatch_test aie_hsa_dispatch_test.cc)
 target_link_libraries(aie_hsa_dispatch_test PUBLIC hsa-runtime64::hsa-runtime64)
+
+add_executable(aie_hsa_bare_add_one aie_hsa_bare_add_one.cc)
diff --git a/experimental/hsa/aie_hsa_bare_add_one.cc b/experimental/hsa/aie_hsa_bare_add_one.cc
new file mode 100644
index 000000000..f5843d7e9
--- /dev/null
+++ b/experimental/hsa/aie_hsa_bare_add_one.cc
@@ -0,0 +1,523 @@
+
+/*
+
+RUN: (add_one_test %S) | FileCheck %s
+CHECK: /dev/accel/accel0 open
+CHECK: Driver version 1.1
+CHECK: Heap buffer @:                  0x7f313c000000
+CHECK: Loading pdi
+CHECK: Pdi file size: 3552
+CHECK: Loading dpu inst
+CHECK: Loading dpu inst
+CHECK: DPU 0 instructions @:             0x7f313c008000
+CHECK: DPU 1 instructions @:             0x7f313c010000
+CHECK: PDI file @:                     0x7f313c000000
+CHECK: PDI handle @:                     2
+CHECK: Input @:             0x7f313c018000
+CHECK: Output @:             0x7f313c020000
+CHECK: Input @:             0x7f313c028000
+CHECK: Output @:             0x7f313c030000
+CHECK: Size of param_config_cu: 0x8
+CHECK: Synch bo ioctl failed for handle 11
+CHECK: Synch bo ioctl failed for handle 9
+CHECK: Synch bo ioctl failed for handle 10
+CHECK: Checking run 0:
+CHECK: Checking run 1:
+CHECK: PASS!
+CHECK: Closing
+CHECK: Done
+
+ */
+
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+
+#include "amdxdna_accel.h"
+#include "hsa_ipu.h"
+
+#define DATA_BUFFER_SIZE (1024 * 4)
+
+/*
+ * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
+ * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
+ */
+struct amdxdna_cmd_chain {
+  uint32_t command_count;  // number of handles stored in data[]
+  uint32_t submit_index;
+  uint32_t error_index;
+  uint32_t reserved[3];
+  uint64_t data[] __counted_by(command_count);  // cmd BO handles
+};
+
+/* Exec buffer command header format */
+struct amdxdna_cmd {
+  union {
+    struct {
+      uint32_t state : 4;  // main() writes 1 (ERT_CMD_STATE_NEW)
+      uint32_t unused : 6;
+      uint32_t extra_cu_masks : 2;
+      uint32_t count : 11;  // presumably payload size in words - TODO confirm
+      uint32_t opcode : 5;  // main() uses 0x0 ERT_START_CU, 0x13 ERT_CMD_CHAIN
+      uint32_t reserved : 4;
+    };
+    uint32_t header;  // the 32 bits of bit-fields above as a single word
+  };
+  uint32_t data[] __counted_by(count);  // command payload
+};
+
+// These packets are variable width but using this as a
+// maximum size for now
+#define PACKET_SIZE 64
+
+int main(int argc, char **argv) {
+  int drv_fd;
+  int ret;
+  const char drv_path[] = "/dev/accel/accel0";
+  std::string test_dir(argv[1]);
+  std::string inst_path = test_dir + "/add_one_insts.txt";
+  std::string pdi_path_str = test_dir + "/add_one.pdi";
+  const char *dpu_inst_path = inst_path.c_str();
+  const char *pdi_path = pdi_path_str.c_str();  // Add one kernel
+  uint32_t heap_handle;
+  uint32_t major, minor;
+
+  // open the driver
+  drv_fd = open(drv_path, O_RDWR);
+
+  if (drv_fd < 0) {
+    printf("Error %i opening %s\n", drv_fd, drv_path);
+    return -1;
+  }
+
+  printf("%s open\n", drv_path);
+
+  // get driver version
+  if (get_driver_version(drv_fd, &major, &minor) < 0) {
+    printf("Error getting driver version\n");
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  printf("Driver version %u.%u\n", major, minor);
+
+  /////////////////////////////////////////////////////////////////////////////////
+  // Step 0: Allocate the necessary BOs. This includes:
+  // 1. The operands for the two kernels that will be launched
+  // 2. A heap which contains:
+  //  a. A PDI for the design that will be run
+  //  b. Instruction sequences for both runs
+
+  // reserve some device memory for the heap
+  if (alloc_heap(drv_fd, 48 * 1024 * 1024, &heap_handle) < 0) {
+    perror("Error allocating device heap");
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  uint64_t pdi_vaddr;
+  uint64_t pdi_sram_vaddr;
+  uint32_t pdi_handle;
+  printf("Loading pdi\n");
+  ret = load_pdi(drv_fd, &pdi_vaddr, &pdi_sram_vaddr, &pdi_handle, pdi_path);
+  if (ret < 0) {
+    printf("Error %i loading pdi\n", ret);
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  uint64_t dpu_0_vaddr;
+  uint64_t dpu_0_sram_vaddr;
+  uint32_t dpu_0_handle;
+  uint32_t num_dpu_0_insts;
+  printf("Loading dpu inst\n");
+  ret = load_instructions(drv_fd, &dpu_0_vaddr, &dpu_0_sram_vaddr,
+                          &dpu_0_handle, dpu_inst_path, &num_dpu_0_insts);
+  if (ret < 0) {
+    printf("Error %i loading dpu instructions\n", ret);
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  uint64_t dpu_1_vaddr;
+  uint64_t dpu_1_sram_vaddr;
+  uint32_t dpu_1_handle;
+  uint32_t num_dpu_1_insts;
+  printf("Loading dpu inst\n");
+  ret = load_instructions(drv_fd, &dpu_1_vaddr, &dpu_1_sram_vaddr,
+                          &dpu_1_handle, dpu_inst_path, &num_dpu_1_insts);
+  if (ret < 0) {
+    printf("Error %i loading dpu instructions\n", ret);
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  printf("DPU 0 instructions @:             %p\n", (void *)dpu_0_vaddr);
+  printf("DPU 1 instructions @:             %p\n", (void *)dpu_1_vaddr);
+  printf("PDI file @:                     %p\n", (void *)pdi_vaddr);
+  printf("PDI handle @:                     %d\n", pdi_handle);
+
+  uint64_t input_0;
+  uint64_t input_0_sram_vaddr;
+  uint32_t input_0_handle;
+  ret = create_dev_bo(drv_fd, &input_0, &input_0_sram_vaddr, &input_0_handle,
+                      DATA_BUFFER_SIZE);
+  // ret = create_shmem_bo(drv_fd, &input_0, &input_0_sram_vaddr,
+  // &input_0_handle, DATA_BUFFER_SIZE);
+  printf("Input @:             %p\n", (void *)input_0);
+  if (ret < 0) {
+    printf("Error %i creating data 0\n", ret);
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  uint64_t output_0;
+  uint64_t output_0_sram_vaddr;
+  uint32_t output_0_handle;
+  ret = create_dev_bo(drv_fd, &output_0, &output_0_sram_vaddr, &output_0_handle,
+                      DATA_BUFFER_SIZE);
+  // ret = create_shmem_bo(drv_fd, &output_0, &output_0_sram_vaddr,
+  // &output_0_handle, DATA_BUFFER_SIZE);
+  printf("Output @:             %p\n", (void *)output_0);
+  if (ret < 0) {
+    printf("Error %i creating data 1\n", ret);
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  uint64_t input_1;
+  uint64_t input_1_sram_vaddr;
+  uint32_t input_1_handle;
+  ret = create_dev_bo(drv_fd, &input_1, &input_1_sram_vaddr, &input_1_handle,
+                      DATA_BUFFER_SIZE);
+  // ret = create_shmem_bo(drv_fd, &input_1, &input_1_sram_vaddr,
+  // &input_1_handle, DATA_BUFFER_SIZE);
+  printf("Input @:             %p\n", (void *)input_1);
+  if (ret < 0) {
+    printf("Error %i creating data 0\n", ret);
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  uint64_t output_1;
+  uint64_t output_1_sram_vaddr;
+  uint32_t output_1_handle;
+  ret = create_dev_bo(drv_fd, &output_1, &output_1_sram_vaddr, &output_1_handle,
+                      DATA_BUFFER_SIZE);
+  // ret = create_shmem_bo(drv_fd, &output_1, &output_1_sram_vaddr,
+  // &output_1_handle, DATA_BUFFER_SIZE);
+  printf("Output @:             %p\n", (void *)output_1);
+  if (ret < 0) {
+    printf("Error %i creating data 1\n", ret);
+    printf("Closing\n");
+    close(drv_fd);
+    printf("Done\n");
+    return -1;
+  }
+
+  for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) {
+    *((uint32_t *)input_0 + i) = i;
+    *((uint32_t *)input_1 + i) = i + 0xFEEDED1E;
+    *((uint32_t *)output_0 + i) = 0xDEFACE;
+    *((uint32_t *)output_1 + i) = 0xDEADBEEF;
+  }
+
+  // Writing the user buffers
+  sync_bo(drv_fd, input_0_handle);
+  sync_bo(drv_fd, output_0_handle);
+  sync_bo(drv_fd, input_1_handle);
+  sync_bo(drv_fd, output_1_handle);
+
+  // Performing a sync on the queue descriptor, completion signal, queue buffer
+  // and config cu bo.
+  sync_bo(drv_fd, dpu_0_handle);
+  sync_bo(drv_fd, dpu_1_handle);
+  sync_bo(drv_fd, pdi_handle);
+  sync_bo(drv_fd, input_0_handle);
+  sync_bo(drv_fd, output_0_handle);
+
+  /////////////////////////////////////////////////////////////////////////////////
+  // Step 1: Create a user mode queue
+  // This is going to be where we create a queue where we:
+  // 1. Create and configure a hardware context
+  // 2. Allocate the queue buffer as a user-mode queue
+
+  // Allocating a structure to store QOS information
+  struct amdxdna_qos_info *qos =
+      (struct amdxdna_qos_info *)malloc(sizeof(struct amdxdna_qos_info));
+  qos->gops = 0;
+  qos->fps = 0;
+  qos->dma_bandwidth = 0;
+  qos->latency = 0;
+  qos->frame_exec_time = 0;
+  qos->priority = 0;
+
+  // This is the structure that we pass
+  struct amdxdna_drm_create_hwctx create_hw_ctx = {
+      .ext = 0,
+      .ext_flags = 0,
+      .qos_p = (uint64_t)qos,
+      .umq_bo = 0,
+      .log_buf_bo = 0,
+      .max_opc = 0x800,  // Not sure what this is but this was the value used
+      .num_tiles = 4,
+      .mem_size = 0,
+      .umq_doorbell = 0,
+  };
+  ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hw_ctx);
+  if (ret != 0) {
+    perror("Failed to create hwctx");
+    return -1;
+  }
+
+  // Creating a structure to configure the CU
+  struct amdxdna_cu_config cu_config = {
+      .cu_bo = pdi_handle,
+      .cu_func = 0,
+  };
+
+  // Creating a structure to configure the hardware context
+  struct amdxdna_hwctx_param_config_cu param_config_cu;
+  param_config_cu.num_cus = 1;
+  param_config_cu.cu_configs[0] = cu_config;
+
+  printf("Size of param_config_cu: 0x%lx\n", sizeof(param_config_cu));
+
+  // Configuring the hardware context with the PDI
+  struct amdxdna_drm_config_hwctx config_hw_ctx = {
+      .handle = create_hw_ctx.handle,
+      .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU,
+      .param_val =
+          (uint64_t)&param_config_cu,  // Pass in the pointer to the param value
+      .param_val_size = 0x10,          // Size of param config CU is 16B
+  };
+  ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx);
+  if (ret != 0) {
+    perror("Failed to config hwctx");
+    return -1;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////
+  // Step 2: Configuring the CMD BOs with the different instruction sequences
+  struct amdxdna_drm_create_bo create_cmd_bo_0 = {
+      .type = AMDXDNA_BO_CMD,
+      .size = PACKET_SIZE,
+  };
+  int cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo_0);
+  if (cmd_bo_ret != 0) {
+    perror("Failed to create cmd_0");
+    return -1;
+  }
+
+  struct amdxdna_drm_get_bo_info cmd_bo_0_get_bo_info = {
+      .handle = create_cmd_bo_0.handle};
+  ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_0_get_bo_info);
+  if (ret != 0) {
+    perror("Failed to get cmd BO 0 info");
+    return -2;
+  }
+
+  // Writing the first packet to the queue
+  struct amdxdna_cmd *cmd_0 = (struct amdxdna_cmd *)mmap(
+      0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd,
+      cmd_bo_0_get_bo_info.map_offset);
+  cmd_0->state = 1;  // ERT_CMD_STATE_NEW;
+  cmd_0->extra_cu_masks = 0;
+  cmd_0->count = 0xF;    // NOTE: For some reason this needs to be larger
+  cmd_0->opcode = 0x0;   // ERT_START_CU;
+  cmd_0->data[0] = 0x3;  // NOTE: This one seems to be skipped
+  cmd_0->data[1] = 0x3;  // Transaction opcode
+  cmd_0->data[2] = 0x0;
+  cmd_0->data[3] = dpu_0_sram_vaddr;
+  cmd_0->data[4] = 0x0;
+  cmd_0->data[5] = 0x44;                           // Size of DPU instruction
+  cmd_0->data[6] = input_0 & 0xFFFFFFFF;           // Input low
+  cmd_0->data[7] = (input_0 >> 32) & 0xFFFFFFFF;   // Input high
+  cmd_0->data[8] = output_0 & 0xFFFFFFFF;          // Output low
+  cmd_0->data[9] = (output_0 >> 32) & 0xFFFFFFFF;  // Output high
+
+  // Writing to the second packet of the queue
+  struct amdxdna_drm_create_bo create_cmd_bo_1 = {
+      .type = AMDXDNA_BO_CMD,
+      .size = PACKET_SIZE,
+  };
+  cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo_1);
+  if (cmd_bo_ret != 0) {
+    perror("Failed to create cmd_1");
+    return -1;
+  }
+
+  struct amdxdna_drm_get_bo_info cmd_bo_1_get_bo_info = {
+      .handle = create_cmd_bo_1.handle};
+  ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_1_get_bo_info);
+  if (ret != 0) {
+    perror("Failed to get cmd BO 0 info");
+    return -2;
+  }
+
+  struct amdxdna_cmd *cmd_1 = (struct amdxdna_cmd *)mmap(
+      0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd,
+      cmd_bo_1_get_bo_info.map_offset);
+  cmd_1->state = 1;  // ERT_CMD_STATE_NEW;
+  cmd_1->extra_cu_masks = 0;
+  cmd_1->count = 10;     // Number of commands
+  cmd_1->opcode = 0x0;   // ERT_START_CU;
+  cmd_1->data[0] = 0x3;  // This one seems to be skipped
+  cmd_1->data[1] = 0x3;  // Transaction opcode
+  cmd_1->data[2] = 0x0;
+  cmd_1->data[3] = dpu_1_sram_vaddr;
+  cmd_1->data[4] = 0x0;
+  cmd_1->data[5] = 0x44;                           // Size of DPU instruction
+  cmd_1->data[6] = input_1 & 0xFFFFFFFF;           // Input low
+  cmd_1->data[7] = (input_1 >> 32) & 0xFFFFFFFF;   // Input high
+  cmd_1->data[8] = output_1 & 0xFFFFFFFF;          // Output low
+  cmd_1->data[9] = (output_1 >> 32) & 0xFFFFFFFF;  // Output high
+
+  /////////////////////////////////////////////////////////////////////////////////
+  // Step 3: Submit commands -- This requires creating a BO_EXEC that contains
+  // the command chain that points to the instruction sequences just created
+
+  // Allocate a command chain
+  void *bo_cmd_chain_buf = NULL;
+  cmd_bo_ret = posix_memalign(&bo_cmd_chain_buf, 4096, 4096);
+  if (cmd_bo_ret != 0 || bo_cmd_chain_buf == NULL) {
+    printf("[ERROR] Failed to allocate cmd_bo buffer of size %d\n", 4096);
+  }
+
+  struct amdxdna_drm_create_bo create_cmd_chain_bo = {
+      .type = AMDXDNA_BO_CMD,
+      .size = 4096,
+  };
+  cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_chain_bo);
+  if (cmd_bo_ret != 0) {
+    perror("Failed to create command chain BO");
+    return -1;
+  }
+
+  struct amdxdna_drm_get_bo_info cmd_chain_bo_get_bo_info = {
+      .handle = create_cmd_chain_bo.handle};
+  ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_chain_bo_get_bo_info);
+  if (ret != 0) {
+    perror("Failed to get cmd BO 0 info");
+    return -2;
+  }
+
+  struct amdxdna_cmd *cmd_chain =
+      (struct amdxdna_cmd *)mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
+                                 drv_fd, cmd_chain_bo_get_bo_info.map_offset);
+
+  // Writing information to the command buffer
+  struct amdxdna_cmd_chain *cmd_chain_payload =
+      (struct amdxdna_cmd_chain *)(cmd_chain->data);
+  cmd_chain->state = 1;  // ERT_CMD_STATE_NEW;
+  cmd_chain->extra_cu_masks = 0;
+  cmd_chain->count = 0xA;    // TODO: Why is this the value?
+  cmd_chain->opcode = 0x13;  // ERT_CMD_CHAIN
+  cmd_chain_payload->command_count = 2;
+  cmd_chain_payload->submit_index = 0;
+  cmd_chain_payload->error_index = 0;
+  cmd_chain_payload->data[0] = create_cmd_bo_0.handle;
+  cmd_chain_payload->data[1] = create_cmd_bo_1.handle;
+
+  // Reading the user buffers
+  sync_bo(drv_fd, create_cmd_chain_bo.handle);
+  sync_bo(drv_fd, create_cmd_bo_0.handle);
+  sync_bo(drv_fd, create_cmd_bo_1.handle);
+
+  // Perform a submit cmd
+  uint32_t bo_args[6] = {dpu_0_handle,    dpu_1_handle,   input_0_handle,
+                         output_0_handle, input_1_handle, output_1_handle};
+  struct amdxdna_drm_exec_cmd exec_cmd_0 = {
+      .ext = 0,
+      .ext_flags = 0,
+      .hwctx = create_hw_ctx.handle,
+      .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF,
+      .cmd_handles = create_cmd_chain_bo.handle,
+      .args = (uint64_t)bo_args,
+      .cmd_count = 1,
+      .arg_count = sizeof(bo_args) / sizeof(uint32_t),
+  };
+  ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, &exec_cmd_0);
+  if (ret != 0) {
+    perror("Failed to submit work");
+    return -1;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////
+  // Step 4: Wait for the output
+  // Use the wait IOCTL to wait for our submission to complete
+  struct amdxdna_drm_wait_cmd wait_cmd = {
+      .hwctx = create_hw_ctx.handle,
+      .timeout = 50,  // 50ms timeout
+      .seq = exec_cmd_0.seq,
+  };
+
+  ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd);
+  if (ret != 0) {
+    perror("Failed to wait");
+    return -1;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////
+  // Step 5: Verify output
+
+  // Reading the user buffers
+  sync_bo(drv_fd, input_0_handle);
+  sync_bo(drv_fd, output_0_handle);
+  sync_bo(drv_fd, input_1_handle);
+  sync_bo(drv_fd, output_1_handle);
+
+  int errors = 0;
+  printf("Checking run 0:\n");
+  for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) {
+    uint32_t src = *((uint32_t *)input_0 + i);
+    uint32_t dst = *((uint32_t *)output_0 + i);
+    // printf("src: 0x%x\n", src);
+    // printf("dst: 0x%x\n", dst);
+    if (src + 1 != dst) {
+      printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst);
+      errors++;
+    }
+  }
+
+  printf("Checking run 1:\n");
+  for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) {
+    uint32_t src = *((uint32_t *)input_1 + i);
+    uint32_t dst = *((uint32_t *)output_1 + i);
+    // printf("src: 0x%x\n", src);
+    // printf("dst: 0x%x\n", dst);
+    if (src + 1 != dst) {
+      printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst);
+      errors++;
+    }
+  }
+
+  if (!errors) {
+    printf("PASS!\n");
+  } else {
+    printf("FAIL! %d/2048\n", errors);
+  }
+
+  printf("Closing\n");
+  close(drv_fd);
+  printf("Done\n");
+  return 0;
+}
\ No newline at end of file
diff --git a/experimental/hsa/aie_hsa_dispatch_test.cc b/experimental/hsa/aie_hsa_dispatch_test.cc
index 799a060df..fca55e4a4 100644
--- a/experimental/hsa/aie_hsa_dispatch_test.cc
+++ b/experimental/hsa/aie_hsa_dispatch_test.cc
@@ -305,5 +305,5 @@ int main(int argc, char **argv) {
 
   r = hsa_shut_down();
   assert(r == HSA_STATUS_SUCCESS);
-  std::cout << "PASS";
+  std::cout << "PASS\n";
 }
diff --git a/experimental/hsa/amdxdna_accel.h b/experimental/hsa/amdxdna_accel.h
new file mode 100644
index 000000000..048fd3fe6
--- /dev/null
+++ b/experimental/hsa/amdxdna_accel.h
@@ -0,0 +1,569 @@
+/* SPDX-License-Identifier: NCSA OR GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
+ */
+
+#ifndef AMDXDNA_ACCEL_H_
+#define AMDXDNA_ACCEL_H_
+
+#include <libdrm/drm.h>
+#include <linux/const.h>
+#include <linux/stddef.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef __counted_by
+#define __counted_by(cnt)
+#endif
+
+#define AMDXDNA_DRIVER_MAJOR 1
+#define AMDXDNA_DRIVER_MINOR 0
+
+#define AMDXDNA_INVALID_CMD_HANDLE (~0UL)
+#define AMDXDNA_INVALID_ADDR (~0UL)
+#define AMDXDNA_INVALID_CTX_HANDLE 0
+#define AMDXDNA_INVALID_BO_HANDLE 0
+
+/*
+ * The interface can grow/extend over time.
+ * On each struct amdxdna_drm_*, to support potential extension, we defined it
+ * like this.
+ *
+ * Example code:
+ *
+ * struct amdxdna_drm_example_data {
+ *	.ext = (uintptr_t)&example_data_ext;
+ *	...
+ * };
+ *
+ * No extension exists yet. The extension struct will be defined in the future.
+ */
+
+enum amdxdna_drm_ioctl_id {
+  DRM_AMDXDNA_CREATE_HWCTX,
+  DRM_AMDXDNA_DESTROY_HWCTX,
+  DRM_AMDXDNA_CONFIG_HWCTX,
+  DRM_AMDXDNA_CREATE_BO,
+  DRM_AMDXDNA_GET_BO_INFO,
+  DRM_AMDXDNA_SYNC_BO,
+  DRM_AMDXDNA_EXEC_CMD,
+  DRM_AMDXDNA_WAIT_CMD,
+  DRM_AMDXDNA_GET_INFO,
+  DRM_AMDXDNA_SET_STATE,
+  DRM_AMDXDNA_NUM_IOCTLS
+};
+
+enum amdxdna_device_type {
+  AMDXDNA_DEV_TYPE_UNKNOWN = -1,
+  AMDXDNA_DEV_TYPE_KMQ,
+  AMDXDNA_DEV_TYPE_UMQ,
+};
+
+/**
+ * struct amdxdna_qos_info - QoS information for driver.
+ * @gops: Giga operations per second.
+ * @fps: Frames per second.
+ * @dma_bandwidth: DMA bandwidth.
+ * @latency: Frame response latency.
+ * @frame_exec_time: Frame execution time.
+ * @priority: Request priority.
+ *
+ * User program can provide QoS hints to driver.
+ */
+struct amdxdna_qos_info {
+  __u32 gops;
+  __u32 fps;
+  __u32 dma_bandwidth;
+  __u32 latency;
+  __u32 frame_exec_time;
+  __u32 priority;
+};
+
+/**
+ * struct amdxdna_drm_create_hwctx - Create hardware context.
+ * @ext: MBZ.
+ * @ext_flags: MBZ.
+ * @qos_p: Address of QoS info.
+ * @umq_bo: BO handle for user mode queue(UMQ).
+ * @log_buf_bo: BO handle for log buffer.
+ * @max_opc: Maximum operations per cycle.
+ * @num_tiles: Number of AIE tiles.
+ * @mem_size: Size of AIE tile memory.
+ * @umq_doorbell: Returned offset of doorbell associated with UMQ.
+ * @handle: Returned hardware context handle.
+ */
+struct amdxdna_drm_create_hwctx {
+  __u64 ext;
+  __u64 ext_flags;
+  __u64 qos_p;
+  __u32 umq_bo;
+  __u32 log_buf_bo;
+  __u32 max_opc;
+  __u32 num_tiles;
+  __u32 mem_size;
+  __u32 umq_doorbell;
+  __u32 handle;
+};
+
+/**
+ * struct amdxdna_drm_destroy_hwctx - Destroy hardware context.
+ * @handle: Hardware context handle.
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_destroy_hwctx {
+  __u32 handle;
+  __u32 pad;
+};
+
+/**
+ * struct amdxdna_cu_config - configuration for one CU
+ * @cu_bo: CU configuration buffer bo handle
+ * @cu_func: Function of a CU
+ * @pad: MBZ
+ */
+struct amdxdna_cu_config {
+  __u32 cu_bo;
+  __u8 cu_func;
+  __u8 pad[3];
+};
+
+/**
+ * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware
+ * context
+ * @num_cus: Number of CUs to configure
+ * @pad: MBZ
+ * @cu_configs: Array of CU configurations of struct amdxdna_cu_config
+ */
+struct amdxdna_hwctx_param_config_cu {
+  __u16 num_cus;
+  __u16 pad[3];
+  struct amdxdna_cu_config cu_configs[] __counted_by(num_cus);
+};
+
+enum amdxdna_drm_config_hwctx_param {
+  DRM_AMDXDNA_HWCTX_CONFIG_CU,
+  DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF,
+  DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF,
+  DRM_AMDXDNA_HWCTX_CONFIG_NUM
+};
+
+/**
+ * struct amdxdna_drm_config_hwctx - Configure hardware context.
+ * @handle: hardware context handle.
+ * @param_type: Value in enum amdxdna_drm_config_hwctx_param. Specifies the
+ *              structure passed in via param_val.
+ * @param_val: A structure specified by the param_type struct member.
+ * @param_val_size: Size of the parameter buffer pointed to by the param_val.
+ *		    If param_val is not a pointer, driver can ignore this.
+ *
+ * Note: if the param_val is a pointer pointing to a buffer, the maximum size
+ * of the buffer is 4KiB(PAGE_SIZE).
+ */
+struct amdxdna_drm_config_hwctx {
+  __u32 handle;
+  __u32 param_type;
+  __u64 param_val;
+  __u32 param_val_size;
+  __u32 pad;
+};
+
+/*
+ * AMDXDNA_BO_SHMEM:	DRM GEM SHMEM bo
+ * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory
+ * AMDXDNA_BO_DEV:	Allocated from BO_DEV_HEAP
+ * AMDXDNA_BO_CMD:	User and driver accessible bo
+ * AMDXDNA_BO_DMA:	DRM GEM DMA bo
+ */
+enum amdxdna_bo_type {
+  AMDXDNA_BO_INVALID = 0,
+  AMDXDNA_BO_SHMEM,
+  AMDXDNA_BO_DEV_HEAP,
+  AMDXDNA_BO_DEV,
+  AMDXDNA_BO_CMD,
+  AMDXDNA_BO_DMA,
+};
+
+/**
+ * struct amdxdna_drm_create_bo - Create a buffer object.
+ * @flags: Buffer flags. MBZ.
+ * @type: Buffer type.
+ * @vaddr: User VA of buffer if applied. MBZ.
+ * @size: Size in bytes.
+ * @handle: Returned DRM buffer object handle.
+ */
+struct amdxdna_drm_create_bo {
+  __u64 flags;
+  __u32 type;
+  __u32 _pad;
+  __u64 vaddr;
+  __u64 size;
+  __u32 handle;
+};
+
+/**
+ * struct amdxdna_drm_get_bo_info - Get buffer object information.
+ * @ext: MBZ.
+ * @ext_flags: MBZ.
+ * @handle: DRM buffer object handle.
+ * @map_offset: Returned DRM fake offset for mmap().
+ * @vaddr: Returned user VA of buffer. 0 in case user needs mmap().
+ * @xdna_addr: Returned XDNA device virtual address.
+ */
+struct amdxdna_drm_get_bo_info {
+  __u64 ext;
+  __u64 ext_flags;
+  __u32 handle;
+  __u32 _pad;
+  __u64 map_offset;
+  __u64 vaddr;
+  __u64 xdna_addr;
+};
+
+/**
+ * struct amdxdna_drm_sync_bo - Sync buffer object.
+ * @handle: Buffer object handle.
+ * @direction: Direction of sync, can be from device or to device.
+ * @offset: Offset in the buffer to sync.
+ * @size: Size in bytes.
+ */
+struct amdxdna_drm_sync_bo {
+  __u32 handle;
+#define SYNC_DIRECT_TO_DEVICE 0U
+#define SYNC_DIRECT_FROM_DEVICE 1U
+  __u32 direction;
+  __u64 offset;
+  __u64 size;
+};
+
+enum amdxdna_cmd_type {
+  AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0,
+  AMDXDNA_CMD_SUBMIT_DEPENDENCY,
+  AMDXDNA_CMD_SUBMIT_SIGNAL,
+};
+
+/**
+ * struct amdxdna_drm_exec_cmd - Execute command.
+ * @ext: MBZ.
+ * @ext_flags: MBZ.
+ * @hwctx: Hardware context handle.
+ * @type: One of command type in enum amdxdna_cmd_type.
+ * @cmd_handles: Array of command handles or the command handle itself in case
+ * of just one.
+ * @args: Array of arguments for all command handles.
+ * @cmd_count: Number of command handles in the cmd_handles array.
+ * @arg_count: Number of arguments in the args array.
+ * @seq: Returned sequence number for this command.
+ */
+struct amdxdna_drm_exec_cmd {
+  __u64 ext;
+  __u64 ext_flags;
+  __u32 hwctx;
+  __u32 type;
+  __u64 cmd_handles;
+  __u64 args;
+  __u32 cmd_count;
+  __u32 arg_count;
+  __u64 seq;
+};
+
+/**
+ * struct amdxdna_drm_wait_cmd - Wait execution command.
+ *
+ * @hwctx: hardware context handle.
+ * @timeout: timeout in ms, 0 implies infinite wait.
+ * @seq: sequence number of the command returned by execute command.
+ *
+ * Wait a command specified by seq to be completed.
+ * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
+ * to submit a new command.
+ */
+struct amdxdna_drm_wait_cmd {
+  __u32 hwctx;
+  __u32 timeout;
+  __u64 seq;
+};
+
+/**
+ * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware
+ * @buffer: The user space buffer that will return the AIE status
+ * @buffer_size: The size of the user space buffer
+ * @cols_filled: A bitmap of AIE columns whose data has been returned in the
+ * buffer.
+ */
+struct amdxdna_drm_query_aie_status {
+  __u64 buffer;      /* out */
+  __u32 buffer_size; /* in */
+  __u32 cols_filled; /* out */
+};
+
+/**
+ * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware
+ * @major: The major version number
+ * @minor: The minor version number
+ */
+struct amdxdna_drm_query_aie_version {
+  __u32 major; /* out */
+  __u32 minor; /* out */
+};
+
+/**
+ * struct amdxdna_drm_query_aie_tile_metadata - Query the metadata of AIE tile
+ * (core, mem, shim)
+ * @row_count: The number of rows.
+ * @row_start: The starting row number.
+ * @dma_channel_count: The number of dma channels.
+ * @lock_count: The number of locks.
+ * @event_reg_count: The number of events.
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_query_aie_tile_metadata {
+  __u16 row_count;
+  __u16 row_start;
+  __u16 dma_channel_count;
+  __u16 lock_count;
+  __u16 event_reg_count;
+  __u16 pad[3];
+};
+
+/**
+ * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE
+ * hardware
+ * @col_size: The size of a column in bytes.
+ * @cols: The total number of columns.
+ * @rows: The total number of rows.
+ * @version: The version of the AIE hardware.
+ * @core: The metadata for all core tiles.
+ * @mem: The metadata for all mem tiles.
+ * @shim: The metadata for all shim tiles.
+ */
+struct amdxdna_drm_query_aie_metadata {
+  __u32 col_size;
+  __u16 cols;
+  __u16 rows;
+  struct amdxdna_drm_query_aie_version version;
+  struct amdxdna_drm_query_aie_tile_metadata core;
+  struct amdxdna_drm_query_aie_tile_metadata mem;
+  struct amdxdna_drm_query_aie_tile_metadata shim;
+};
+
+/**
+ * struct amdxdna_drm_query_clock - Metadata for a clock
+ * @name: The clock name.
+ * @freq_mhz: The clock frequency.
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_query_clock {
+  __u8 name[16];
+  __u32 freq_mhz;
+  __u32 pad;
+};
+
+/**
+ * struct amdxdna_drm_query_clock_metadata - Query metadata for clocks
+ * @mp_npu_clock: The metadata for MP-NPU clock.
+ * @h_clock: The metadata for H clock.
+ */
+struct amdxdna_drm_query_clock_metadata {
+  struct amdxdna_drm_query_clock mp_npu_clock;
+  struct amdxdna_drm_query_clock h_clock;
+};
+
+enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER };
+
+/**
+ * struct amdxdna_drm_query_sensor - The data for single sensor.
+ * @label: The name for a sensor.
+ * @input: The current value of the sensor.
+ * @max: The maximum value possible for the sensor.
+ * @average: The average value of the sensor.
+ * @highest: The highest recorded sensor value for this driver load for the
+ * sensor.
+ * @status: The sensor status.
+ * @units: The sensor units.
+ * @unitm: Translates value member variables into the correct unit via (pow(10,
+ * unitm) * value)
+ * @type: The sensor type from enum amdxdna_sensor_type
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_query_sensor {
+  __u8 label[64];
+  __u32 input;
+  __u32 max;
+  __u32 average;
+  __u32 highest;
+  __u8 status[64];
+  __u8 units[16];
+  __s8 unitm;
+  __u8 type;
+  __u8 pad[6];
+};
+
+/**
+ * struct amdxdna_drm_query_hwctx - The data for single context.
+ * @context_id: The ID for this context.
+ * @start_col: The starting column for the partition assigned to this context.
+ * @num_col: The number of columns in the partition assigned to this context.
+ * @pid: The Process ID of the process that created this context.
+ * @command_submissions: The number of commands submitted to this context.
+ * @command_completions: The number of commands completed by this context.
+ * @migrations: The number of times this context has been moved to a different
+ * partition.
+ * @preemptions: The number of times this context has been preempted by another
+ * context in the same partition.
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_query_hwctx {
+  __u32 context_id;
+  __u32 start_col;
+  __u32 num_col;
+  __u32 pad;
+  __s64 pid;
+  __u64 command_submissions;
+  __u64 command_completions;
+  __u64 migrations;
+  __u64 preemptions;
+  __u64 errors;
+};
+
+/**
+ * struct amdxdna_drm_aie_mem - The data for AIE memory read/write
+ * @col:   The AIE column index
+ * @row:   The AIE row index
+ * @addr:  The AIE memory address to read/write
+ * @size:  The size of bytes to read/write
+ * @buf_p: The buffer to store read/write data
+ *
+ * This is used for DRM_AMDXDNA_READ_AIE_MEM and DRM_AMDXDNA_WRITE_AIE_MEM
+ * parameters.
+ */
+struct amdxdna_drm_aie_mem {
+  __u32 col;
+  __u32 row;
+  __u32 addr;
+  __u32 size;
+  __u64 buf_p;
+};
+
+/**
+ * struct amdxdna_drm_aie_reg - The data for AIE register read/write
+ * @col: The AIE column index
+ * @row: The AIE row index
+ * @addr: The AIE register address to read/write
+ * @val: The value to write or returned value from AIE
+ *
+ * This is used for DRM_AMDXDNA_READ_AIE_REG and DRM_AMDXDNA_WRITE_AIE_REG
+ * parameters.
+ */
+struct amdxdna_drm_aie_reg {
+  __u32 col;
+  __u32 row;
+  __u32 addr;
+  __u32 val;
+};
+
+enum amdxdna_drm_get_param {
+  DRM_AMDXDNA_QUERY_AIE_STATUS,
+  DRM_AMDXDNA_QUERY_AIE_METADATA,
+  DRM_AMDXDNA_QUERY_AIE_VERSION,
+  DRM_AMDXDNA_QUERY_CLOCK_METADATA,
+  DRM_AMDXDNA_QUERY_SENSORS,
+  DRM_AMDXDNA_QUERY_HW_CONTEXTS,
+  DRM_AMDXDNA_READ_AIE_MEM,
+  DRM_AMDXDNA_READ_AIE_REG,
+  DRM_AMDXDNA_NUM_GET_PARAM,
+};
+
+/**
+ * struct amdxdna_drm_get_info - Get some information from the AIE hardware.
+ * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed
+ * in the buffer.
+ * @buffer_size: Size of the input buffer. Size needed/written by the kernel.
+ * @buffer: A structure specified by the param struct member.
+ */
+struct amdxdna_drm_get_info {
+  __u32 param;       /* in */
+  __u32 buffer_size; /* in/out */
+  __u64 buffer;      /* in/out */
+};
+
+enum amdxdna_power_mode_type {
+  XRT_POWER_MODE_DEFAULT, /**< Fallback to calculated DPM */
+  XRT_POWER_MODE_LOW,     /**< Set frequency to lowest DPM */
+  XRT_POWER_MODE_MEDIUM,  /**< Set frequency to medium DPM */
+  XRT_POWER_MODE_HIGH,    /**< Set frequency to highest DPM */
+};
+
+/**
+ * struct amdxdna_drm_set_power_mode - Set the power mode of the AIE hardware
+ * @power_mode: The power mode from enum amdxdna_power_mode_type
+ * @pad: MBZ.
+ */
+struct amdxdna_drm_set_power_mode {
+  __u8 power_mode;
+  __u8 pad[7];
+};
+
+enum amdxdna_drm_set_param {
+  DRM_AMDXDNA_SET_POWER_MODE,
+  DRM_AMDXDNA_WRITE_AIE_MEM,
+  DRM_AMDXDNA_WRITE_AIE_REG,
+  DRM_AMDXDNA_NUM_SET_PARAM,
+};
+
+/**
+ * struct amdxdna_drm_set_state - Set the state of some component within the AIE
+ * hardware.
+ * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed
+ * in the buffer.
+ * @buffer_size: Size of the input buffer.
+ * @buffer: A structure specified by the param struct member.
+ */
+struct amdxdna_drm_set_state {
+  __u32 param;       /* in */
+  __u32 buffer_size; /* in */
+  __u64 buffer;      /* in */
+};
+
+#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX                  \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
+           struct amdxdna_drm_create_hwctx)
+
+#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX                  \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \
+           struct amdxdna_drm_destroy_hwctx)
+
+#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX                  \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \
+           struct amdxdna_drm_config_hwctx)
+
+#define DRM_IOCTL_AMDXDNA_CREATE_BO                  \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \
+           struct amdxdna_drm_create_bo)
+
+#define DRM_IOCTL_AMDXDNA_GET_BO_INFO                  \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \
+           struct amdxdna_drm_get_bo_info)
+
+#define DRM_IOCTL_AMDXDNA_SYNC_BO \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo)
+
+#define DRM_IOCTL_AMDXDNA_EXEC_CMD \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd)
+
+#define DRM_IOCTL_AMDXDNA_WAIT_CMD \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd)
+
+#define DRM_IOCTL_AMDXDNA_GET_INFO \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info)
+
+#define DRM_IOCTL_AMDXDNA_SET_STATE                  \
+  DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \
+           struct amdxdna_drm_set_state)
+
+#if defined(__cplusplus)
+} /* extern c end */
+#endif
+
+#endif /* AMDXDNA_ACCEL_H_ */
diff --git a/experimental/hsa/hsa_ipu.h b/experimental/hsa/hsa_ipu.h
new file mode 100644
index 000000000..31176a34f
--- /dev/null
+++ b/experimental/hsa/hsa_ipu.h
@@ -0,0 +1,270 @@
+
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+
+#include "amdxdna_accel.h"
+
+// want to mmap the file
+#include <sys/io.h>
+#include <sys/mman.h>
+
+#define MAX_NUM_INSTRUCTIONS 1024  // Maximum number of dpu or pdi instructions.
+
+// Dummy packet defines
+
+int map_doorbell(int fd, uint64_t *doorbell) {
+  // Mmap the mailbox.
+  int32_t page_size = 4096;
+  *doorbell = (uint64_t)mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+                             MAP_SHARED, fd, 0);
+  if (doorbell != MAP_FAILED) {
+    printf("Doorbell mapped\n");
+    return 0;
+  }
+
+  printf("[ERROR] doorbell mmap failed: %s\n", strerror(errno));
+  return errno;
+}
+
+void ring_doorbell(uint64_t doorbell) {
+  int32_t curr_tail = *((int32_t *)doorbell);
+  *((uint32_t *)doorbell) = curr_tail + 0x94;
+}
+
+int get_driver_version(int fd, __u32 *major, __u32 *minor) {
+  int ret;
+  struct amdxdna_drm_query_aie_version version;
+
+  struct amdxdna_drm_get_info info_params = {
+      .param = DRM_AMDXDNA_QUERY_AIE_VERSION,
+      .buffer_size = sizeof(version),
+      .buffer = (__u64)&version,
+  };
+
+  ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_INFO, &info_params);
+  if (ret == 0) {
+    *major = version.major;
+    *minor = version.minor;
+  }
+
+  return ret;
+}
+
+/*
+        Allocates a heap on the device by creating a BO of type dev heap
+*/
+static int alloc_heap(int fd, __u32 size, __u32 *handle) {
+  int ret;
+  void *heap_buf = NULL;
+  const size_t alignment = 64 * 1024 * 1024;
+  ret = posix_memalign(&heap_buf, alignment, size);
+  if (ret != 0 || heap_buf == NULL) {
+    printf("[ERROR] Failed to allocate heap buffer of size %d\n", size);
+  }
+
+  void *dev_heap_parent = mmap(0, alignment * 2 - 1, PROT_READ | PROT_WRITE,
+                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+  if (dev_heap_parent == MAP_FAILED) {
+    dev_heap_parent = nullptr;
+    return -1;
+  }
+
+  struct amdxdna_drm_create_bo create_bo_params = {
+      .type = AMDXDNA_BO_DEV_HEAP,
+      .size = size,
+  };
+
+  ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_params);
+  if (ret == 0 && handle) {
+    *handle = create_bo_params.handle;
+  }
+
+  struct amdxdna_drm_get_bo_info get_bo_info = {.handle =
+                                                    create_bo_params.handle};
+  ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info);
+  if (ret != 0) {
+    perror("Failed to get BO info");
+    return -2;
+  }
+
+  // Need to free the heap buf but still use the address so we can
+  // ensure alignment
+  free(heap_buf);
+  heap_buf = (void *)mmap(heap_buf, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                          fd, get_bo_info.map_offset);
+  printf("Heap buffer @:                  %p\n", heap_buf);
+
+  return ret;
+}
+
+/*
+        Creates a dev bo which is carved out of the heap bo.
+*/
+static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
+                         __u32 *handle, __u64 size_in_bytes) {
+  struct amdxdna_drm_create_bo create_bo = {
+      .type = AMDXDNA_BO_DEV,
+      .size = size_in_bytes,
+  };
+  int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo);
+  if (ret != 0) {
+    perror("Failed to create BO");
+    return -1;
+  }
+
+  struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle};
+  ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info);
+  if (ret != 0) {
+    perror("Failed to get BO info");
+    return -2;
+  }
+
+  *vaddr = get_bo_info.vaddr;
+  *sram_vaddr = get_bo_info.xdna_addr;
+  *handle = create_bo.handle;
+  return 0;
+}
+
+/*
+        Creates a shmem bo
+*/
+static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
+                           __u32 *handle, __u64 size_in_bytes) {
+  const size_t alignment = 64 * 1024 * 1024;
+  void *shmem_create = NULL;
+  int ret = posix_memalign(&shmem_create, alignment, size_in_bytes);
+  if (ret != 0) {
+    printf("[ERROR] Failed to allocate shmem bo of size %lld\n", size_in_bytes);
+  }
+
+  // Touching buffer to map page
+  *(uint32_t *)shmem_create = 0xDEADBEEF;
+
+  printf("Shmem BO @:                     %p\n", shmem_create);
+
+  struct amdxdna_drm_create_bo create_bo = {.type = AMDXDNA_BO_SHMEM,
+                                            .vaddr = (__u64)shmem_create,
+                                            .size = size_in_bytes};
+  ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo);
+  if (ret != 0) {
+    perror("Failed to create BO");
+    return -1;
+  }
+
+  struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle};
+  ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info);
+  if (ret != 0) {
+    perror("Failed to get BO info");
+    return -2;
+  }
+
+  *vaddr = (__u64)shmem_create;
+  *sram_vaddr = get_bo_info.xdna_addr;
+  *handle = create_bo.handle;
+  return 0;
+}
+
+/*
+  Wrapper around synch bo ioctl.
+*/
+static int sync_bo(int fd, __u32 handle) {
+  struct amdxdna_drm_sync_bo sync_params = {
+      .handle = handle,
+  };
+  int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params);
+  if (ret != 0) {
+    printf("Synch bo ioctl failed for handle %d\n", handle);
+  }
+  return ret;
+}
+
+/*
+  Create a BO_DEV and populate it with a PDI
+*/
+
+static int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle,
+                    const char *path) {
+  FILE *file = fopen(path, "r");
+  if (file == NULL) {
+    perror("Failed to open instructions file.");
+    return -1;
+  }
+
+  fseek(file, 0L, SEEK_END);
+  ssize_t file_size = ftell(file);
+  fseek(file, 0L, SEEK_SET);
+
+  printf("Pdi file size: %ld\n", file_size);
+
+  fclose(file);
+
+  // Mmaping the file
+  int pdi_fd = open(path, O_RDONLY);
+  uint64_t *file_data =
+      (uint64_t *)mmap(0, file_size, PROT_READ, MAP_PRIVATE, pdi_fd, 0);
+
+  // Creating a BO_DEV bo to store the pdi file.
+  int ret = create_dev_bo(fd, vaddr, sram_addr, handle, file_size);
+  if (ret != 0) {
+    perror("Failed to create pdi BO");
+    return -1;
+  }
+
+  // copy the file into Bo dev
+  uint64_t *bo = (uint64_t *)*vaddr;
+  memcpy(bo, file_data, file_size);
+
+  close(pdi_fd);
+  return 0;
+}
+
+/*
+  Create a BO DEV and populate it with instructions whose virtual address is
+  passed to the driver via an HSA packet.
+*/
+static int load_instructions(int fd, uint64_t *vaddr, uint64_t *sram_addr,
+                             __u32 *handle, const char *path, __u32 *num_inst) {
+  // read dpu instructions into an array
+  FILE *file = fopen(path, "r");
+  if (file == NULL) {
+    perror("Failed to open instructions file.");
+    return -1;
+  }
+
+  char *line = NULL;
+  size_t len = 0;
+  __u32 inst_array[MAX_NUM_INSTRUCTIONS];
+  __u32 inst_counter = 0;
+  while (getline(&line, &len, file) != -1) {
+    inst_array[inst_counter++] = strtoul(line, NULL, 16);
+    if (inst_counter >= MAX_NUM_INSTRUCTIONS) {
+      perror("Instruction array overflowed.");
+      return -2;
+    }
+  }
+  fclose(file);
+
+  // Creating a BO_DEV bo to store the instruction.
+  int ret =
+      create_dev_bo(fd, vaddr, sram_addr, handle, inst_counter * sizeof(__u32));
+  if (ret != 0) {
+    perror("Failed to create dpu BO");
+    return -3;
+  }
+
+  *num_inst = inst_counter;
+
+  memcpy((__u32 *)*vaddr, inst_array, inst_counter * sizeof(__u32));
+  return ret;
+}

From 0aab102b2c766a17be892ec39515ab5a7a5a472e Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Tue, 27 Aug 2024 22:22:03 -0500
Subject: [PATCH 3/3] cleanup

---
 .github/workflows/ci-linux.yml           | 26 ++++++-
 experimental/CMakeLists.txt              |  4 +
 experimental/hsa/aie_hsa_bare_add_one.cc | 98 ++++++++----------------
 experimental/hsa/hsa_ipu.h               | 53 ++++++-------
 4 files changed, 84 insertions(+), 97 deletions(-)

diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index 5071219ab..835151081 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -201,11 +201,13 @@ jobs:
             print_ir_aie2xclbin_results \
             $PWD/llvm-aie
 
-  test_hsa:
+  test_hsa_full:
     name: HSA MWE
     strategy:
       fail-fast: false
-    runs-on: linux-phoenix-20240606
+      matrix:
+        runs-on: [linux-phoenix-20240606]
+    runs-on: ${{ matrix.runs-on }}
     steps:
       - name: "Checking out repository" # for test scripts
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
@@ -219,6 +221,24 @@ jobs:
           cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja
           ninja aie_hsa_dispatch_test
           ./aie_hsa_dispatch_test $PWD/..
-          
+
+  test_hsa_bare:
+    name: HSA MWE
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [linux-phoenix-20240819]
+    runs-on: ${{ matrix.runs-on }}
+    steps:
+      - name: "Checking out repository" # for test scripts
+        uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
+        with:
+          submodules: false # not required for testbench
+
+      - name: Build and run HSA example
+        run: |
+          cd experimental/hsa
+          mkdir build && pushd build
+          cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja
           ninja aie_hsa_bare_add_one
           ./aie_hsa_bare_add_one $PWD/..
diff --git a/experimental/CMakeLists.txt b/experimental/CMakeLists.txt
index dee0aab59..0f53224b1 100644
--- a/experimental/CMakeLists.txt
+++ b/experimental/CMakeLists.txt
@@ -5,3 +5,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 add_subdirectory(delegate)
+find_package(hsa-runtime64 QUIET)
+if(${hsa-runtime64_FOUND})
+  add_subdirectory(hsa)
+endif()
\ No newline at end of file
diff --git a/experimental/hsa/aie_hsa_bare_add_one.cc b/experimental/hsa/aie_hsa_bare_add_one.cc
index f5843d7e9..ad41567b0 100644
--- a/experimental/hsa/aie_hsa_bare_add_one.cc
+++ b/experimental/hsa/aie_hsa_bare_add_one.cc
@@ -1,39 +1,12 @@
-
-/*
-
-RUN: (add_one_test %S) | FileCheck %s
-CHECK: /dev/accel/accel0 open
-CHECK: Driver version 1.1
-CHECK: Heap buffer @:                  0x7f313c000000
-CHECK: Loading pdi
-CHECK: Pdi file size: 3552
-CHECK: Loading dpu inst
-CHECK: Loading dpu inst
-CHECK: DPU 0 instructions @:             0x7f313c008000
-CHECK: DPU 1 instructions @:             0x7f313c010000
-CHECK: PDI file @:                     0x7f313c000000
-CHECK: PDI handle @:                     2
-CHECK: Input @:             0x7f313c018000
-CHECK: Output @:             0x7f313c020000
-CHECK: Input @:             0x7f313c028000
-CHECK: Output @:             0x7f313c030000
-CHECK: Size of param_config_cu: 0x8
-CHECK: Synch bo ioctl failed for handle 11
-CHECK: Synch bo ioctl failed for handle 9
-CHECK: Synch bo ioctl failed for handle 10
-CHECK: Checking run 0:
-CHECK: Checking run 1:
-CHECK: PASS!
-CHECK: Closing
-CHECK: Done
-
- */
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <cstdint>
 #include <cstdlib>
-#include <ctime>
 #include <fstream>
-#include <iostream>
 
 #include "amdxdna_accel.h"
 #include "hsa_ipu.h"
@@ -174,8 +147,6 @@ int main(int argc, char **argv) {
   uint32_t input_0_handle;
   ret = create_dev_bo(drv_fd, &input_0, &input_0_sram_vaddr, &input_0_handle,
                       DATA_BUFFER_SIZE);
-  // ret = create_shmem_bo(drv_fd, &input_0, &input_0_sram_vaddr,
-  // &input_0_handle, DATA_BUFFER_SIZE);
   printf("Input @:             %p\n", (void *)input_0);
   if (ret < 0) {
     printf("Error %i creating data 0\n", ret);
@@ -190,8 +161,6 @@ int main(int argc, char **argv) {
   uint32_t output_0_handle;
   ret = create_dev_bo(drv_fd, &output_0, &output_0_sram_vaddr, &output_0_handle,
                       DATA_BUFFER_SIZE);
-  // ret = create_shmem_bo(drv_fd, &output_0, &output_0_sram_vaddr,
-  // &output_0_handle, DATA_BUFFER_SIZE);
   printf("Output @:             %p\n", (void *)output_0);
   if (ret < 0) {
     printf("Error %i creating data 1\n", ret);
@@ -206,8 +175,6 @@ int main(int argc, char **argv) {
   uint32_t input_1_handle;
   ret = create_dev_bo(drv_fd, &input_1, &input_1_sram_vaddr, &input_1_handle,
                       DATA_BUFFER_SIZE);
-  // ret = create_shmem_bo(drv_fd, &input_1, &input_1_sram_vaddr,
-  // &input_1_handle, DATA_BUFFER_SIZE);
   printf("Input @:             %p\n", (void *)input_1);
   if (ret < 0) {
     printf("Error %i creating data 0\n", ret);
@@ -222,8 +189,6 @@ int main(int argc, char **argv) {
   uint32_t output_1_handle;
   ret = create_dev_bo(drv_fd, &output_1, &output_1_sram_vaddr, &output_1_handle,
                       DATA_BUFFER_SIZE);
-  // ret = create_shmem_bo(drv_fd, &output_1, &output_1_sram_vaddr,
-  // &output_1_handle, DATA_BUFFER_SIZE);
   printf("Output @:             %p\n", (void *)output_1);
   if (ret < 0) {
     printf("Error %i creating data 1\n", ret);
@@ -261,7 +226,7 @@ int main(int argc, char **argv) {
   // 2. Allocate the queue buffer as a user-mode queue
 
   // Allocating a structure to store QOS information
-  struct amdxdna_qos_info *qos =
+  amdxdna_qos_info *qos =
       (struct amdxdna_qos_info *)malloc(sizeof(struct amdxdna_qos_info));
   qos->gops = 0;
   qos->fps = 0;
@@ -271,7 +236,7 @@ int main(int argc, char **argv) {
   qos->priority = 0;
 
   // This is the structure that we pass
-  struct amdxdna_drm_create_hwctx create_hw_ctx = {
+  amdxdna_drm_create_hwctx create_hw_ctx = {
       .ext = 0,
       .ext_flags = 0,
       .qos_p = (uint64_t)qos,
@@ -289,25 +254,26 @@ int main(int argc, char **argv) {
   }
 
   // Creating a structure to configure the CU
-  struct amdxdna_cu_config cu_config = {
+  amdxdna_cu_config cu_config = {
       .cu_bo = pdi_handle,
       .cu_func = 0,
   };
 
   // Creating a structure to configure the hardware context
-  struct amdxdna_hwctx_param_config_cu param_config_cu;
+  amdxdna_hwctx_param_config_cu param_config_cu;
   param_config_cu.num_cus = 1;
   param_config_cu.cu_configs[0] = cu_config;
 
   printf("Size of param_config_cu: 0x%lx\n", sizeof(param_config_cu));
 
   // Configuring the hardware context with the PDI
-  struct amdxdna_drm_config_hwctx config_hw_ctx = {
+  amdxdna_drm_config_hwctx config_hw_ctx = {
       .handle = create_hw_ctx.handle,
       .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU,
-      .param_val =
-          (uint64_t)&param_config_cu,  // Pass in the pointer to the param value
-      .param_val_size = 0x10,          // Size of param config CU is 16B
+      // Pass in the pointer to the param value
+      .param_val = (uint64_t)&param_config_cu,
+      // Size of param config CU is 16B
+      .param_val_size = 0x10,
   };
   ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx);
   if (ret != 0) {
@@ -317,7 +283,7 @@ int main(int argc, char **argv) {
 
   /////////////////////////////////////////////////////////////////////////////////
   // Step 2: Configuring the CMD BOs with the different instruction sequences
-  struct amdxdna_drm_create_bo create_cmd_bo_0 = {
+  amdxdna_drm_create_bo create_cmd_bo_0 = {
       .type = AMDXDNA_BO_CMD,
       .size = PACKET_SIZE,
   };
@@ -327,8 +293,8 @@ int main(int argc, char **argv) {
     return -1;
   }
 
-  struct amdxdna_drm_get_bo_info cmd_bo_0_get_bo_info = {
-      .handle = create_cmd_bo_0.handle};
+  amdxdna_drm_get_bo_info cmd_bo_0_get_bo_info = {.handle =
+                                                      create_cmd_bo_0.handle};
   ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_0_get_bo_info);
   if (ret != 0) {
     perror("Failed to get cmd BO 0 info");
@@ -336,7 +302,7 @@ int main(int argc, char **argv) {
   }
 
   // Writing the first packet to the queue
-  struct amdxdna_cmd *cmd_0 = (struct amdxdna_cmd *)mmap(
+  amdxdna_cmd *cmd_0 = (struct amdxdna_cmd *)mmap(
       0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd,
       cmd_bo_0_get_bo_info.map_offset);
   cmd_0->state = 1;  // ERT_CMD_STATE_NEW;
@@ -355,7 +321,7 @@ int main(int argc, char **argv) {
   cmd_0->data[9] = (output_0 >> 32) & 0xFFFFFFFF;  // Output high
 
   // Writing to the second packet of the queue
-  struct amdxdna_drm_create_bo create_cmd_bo_1 = {
+  amdxdna_drm_create_bo create_cmd_bo_1 = {
       .type = AMDXDNA_BO_CMD,
       .size = PACKET_SIZE,
   };
@@ -365,15 +331,15 @@ int main(int argc, char **argv) {
     return -1;
   }
 
-  struct amdxdna_drm_get_bo_info cmd_bo_1_get_bo_info = {
-      .handle = create_cmd_bo_1.handle};
+  amdxdna_drm_get_bo_info cmd_bo_1_get_bo_info = {.handle =
+                                                      create_cmd_bo_1.handle};
   ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_1_get_bo_info);
   if (ret != 0) {
     perror("Failed to get cmd BO 0 info");
     return -2;
   }
 
-  struct amdxdna_cmd *cmd_1 = (struct amdxdna_cmd *)mmap(
+  amdxdna_cmd *cmd_1 = (struct amdxdna_cmd *)mmap(
       0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd,
       cmd_bo_1_get_bo_info.map_offset);
   cmd_1->state = 1;  // ERT_CMD_STATE_NEW;
@@ -396,13 +362,13 @@ int main(int argc, char **argv) {
   // the command chain that points to the instruction sequences just created
 
   // Allocate a command chain
-  void *bo_cmd_chain_buf = NULL;
+  void *bo_cmd_chain_buf = nullptr;
   cmd_bo_ret = posix_memalign(&bo_cmd_chain_buf, 4096, 4096);
-  if (cmd_bo_ret != 0 || bo_cmd_chain_buf == NULL) {
+  if (cmd_bo_ret != 0 || bo_cmd_chain_buf == nullptr) {
     printf("[ERROR] Failed to allocate cmd_bo buffer of size %d\n", 4096);
   }
 
-  struct amdxdna_drm_create_bo create_cmd_chain_bo = {
+  amdxdna_drm_create_bo create_cmd_chain_bo = {
       .type = AMDXDNA_BO_CMD,
       .size = 4096,
   };
@@ -412,7 +378,7 @@ int main(int argc, char **argv) {
     return -1;
   }
 
-  struct amdxdna_drm_get_bo_info cmd_chain_bo_get_bo_info = {
+  amdxdna_drm_get_bo_info cmd_chain_bo_get_bo_info = {
       .handle = create_cmd_chain_bo.handle};
   ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_chain_bo_get_bo_info);
   if (ret != 0) {
@@ -420,12 +386,12 @@ int main(int argc, char **argv) {
     return -2;
   }
 
-  struct amdxdna_cmd *cmd_chain =
+  amdxdna_cmd *cmd_chain =
       (struct amdxdna_cmd *)mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
                                  drv_fd, cmd_chain_bo_get_bo_info.map_offset);
 
   // Writing information to the command buffer
-  struct amdxdna_cmd_chain *cmd_chain_payload =
+  amdxdna_cmd_chain *cmd_chain_payload =
       (struct amdxdna_cmd_chain *)(cmd_chain->data);
   cmd_chain->state = 1;  // ERT_CMD_STATE_NEW;
   cmd_chain->extra_cu_masks = 0;
@@ -445,7 +411,7 @@ int main(int argc, char **argv) {
   // Perform a submit cmd
   uint32_t bo_args[6] = {dpu_0_handle,    dpu_1_handle,   input_0_handle,
                          output_0_handle, input_1_handle, output_1_handle};
-  struct amdxdna_drm_exec_cmd exec_cmd_0 = {
+  amdxdna_drm_exec_cmd exec_cmd_0 = {
       .ext = 0,
       .ext_flags = 0,
       .hwctx = create_hw_ctx.handle,
@@ -464,7 +430,7 @@ int main(int argc, char **argv) {
   /////////////////////////////////////////////////////////////////////////////////
   // Step 4: Wait for the output
   // Use the wait IOCTL to wait for our submission to complete
-  struct amdxdna_drm_wait_cmd wait_cmd = {
+  amdxdna_drm_wait_cmd wait_cmd = {
       .hwctx = create_hw_ctx.handle,
       .timeout = 50,  // 50ms timeout
       .seq = exec_cmd_0.seq,
@@ -490,8 +456,6 @@ int main(int argc, char **argv) {
   for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) {
     uint32_t src = *((uint32_t *)input_0 + i);
     uint32_t dst = *((uint32_t *)output_0 + i);
-    // printf("src: 0x%x\n", src);
-    // printf("dst: 0x%x\n", dst);
     if (src + 1 != dst) {
       printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst);
       errors++;
@@ -502,8 +466,6 @@ int main(int argc, char **argv) {
   for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) {
     uint32_t src = *((uint32_t *)input_1 + i);
     uint32_t dst = *((uint32_t *)output_1 + i);
-    // printf("src: 0x%x\n", src);
-    // printf("dst: 0x%x\n", dst);
     if (src + 1 != dst) {
       printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst);
       errors++;
diff --git a/experimental/hsa/hsa_ipu.h b/experimental/hsa/hsa_ipu.h
index 31176a34f..4ba2505e6 100644
--- a/experimental/hsa/hsa_ipu.h
+++ b/experimental/hsa/hsa_ipu.h
@@ -1,3 +1,9 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#pragma once
 
 #include <fcntl.h>
 #include <sys/ioctl.h>
@@ -15,8 +21,6 @@
 #include "amdxdna_accel.h"
 
 // want to mmap the file
-#include <sys/io.h>
-#include <sys/mman.h>
 
 #define MAX_NUM_INSTRUCTIONS 1024  // Maximum number of dpu or pdi instructions.
 
@@ -43,9 +47,9 @@ void ring_doorbell(uint64_t doorbell) {
 
 int get_driver_version(int fd, __u32 *major, __u32 *minor) {
   int ret;
-  struct amdxdna_drm_query_aie_version version;
+  amdxdna_drm_query_aie_version version;
 
-  struct amdxdna_drm_get_info info_params = {
+  amdxdna_drm_get_info info_params = {
       .param = DRM_AMDXDNA_QUERY_AIE_VERSION,
       .buffer_size = sizeof(version),
       .buffer = (__u64)&version,
@@ -63,7 +67,7 @@ int get_driver_version(int fd, __u32 *major, __u32 *minor) {
 /*
         Allocates a heap on the device by creating a BO of type dev heap
 */
-static int alloc_heap(int fd, __u32 size, __u32 *handle) {
+int alloc_heap(int fd, __u32 size, __u32 *handle) {
   int ret;
   void *heap_buf = NULL;
   const size_t alignment = 64 * 1024 * 1024;
@@ -80,7 +84,7 @@ static int alloc_heap(int fd, __u32 size, __u32 *handle) {
     return -1;
   }
 
-  struct amdxdna_drm_create_bo create_bo_params = {
+  amdxdna_drm_create_bo create_bo_params = {
       .type = AMDXDNA_BO_DEV_HEAP,
       .size = size,
   };
@@ -90,8 +94,7 @@ static int alloc_heap(int fd, __u32 size, __u32 *handle) {
     *handle = create_bo_params.handle;
   }
 
-  struct amdxdna_drm_get_bo_info get_bo_info = {.handle =
-                                                    create_bo_params.handle};
+  amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo_params.handle};
   ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info);
   if (ret != 0) {
     perror("Failed to get BO info");
@@ -111,9 +114,9 @@ static int alloc_heap(int fd, __u32 size, __u32 *handle) {
 /*
         Creates a dev bo which is carved out of the heap bo.
 */
-static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
-                         __u32 *handle, __u64 size_in_bytes) {
-  struct amdxdna_drm_create_bo create_bo = {
+int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, __u32 *handle,
+                  __u64 size_in_bytes) {
+  amdxdna_drm_create_bo create_bo = {
       .type = AMDXDNA_BO_DEV,
       .size = size_in_bytes,
   };
@@ -123,7 +126,7 @@ static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
     return -1;
   }
 
-  struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle};
+  amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle};
   ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info);
   if (ret != 0) {
     perror("Failed to get BO info");
@@ -139,8 +142,8 @@ static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
 /*
         Creates a shmem bo
 */
-static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
-                           __u32 *handle, __u64 size_in_bytes) {
+int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
+                    __u32 *handle, __u64 size_in_bytes) {
   const size_t alignment = 64 * 1024 * 1024;
   void *shmem_create = NULL;
   int ret = posix_memalign(&shmem_create, alignment, size_in_bytes);
@@ -153,16 +156,16 @@ static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
 
   printf("Shmem BO @:                     %p\n", shmem_create);
 
-  struct amdxdna_drm_create_bo create_bo = {.type = AMDXDNA_BO_SHMEM,
-                                            .vaddr = (__u64)shmem_create,
-                                            .size = size_in_bytes};
+  amdxdna_drm_create_bo create_bo = {.type = AMDXDNA_BO_SHMEM,
+                                     .vaddr = (__u64)shmem_create,
+                                     .size = size_in_bytes};
   ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo);
   if (ret != 0) {
     perror("Failed to create BO");
     return -1;
   }
 
-  struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle};
+  amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle};
   ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info);
   if (ret != 0) {
     perror("Failed to get BO info");
@@ -178,10 +181,8 @@ static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr,
 /*
   Wrapper around synch bo ioctl.
 */
-static int sync_bo(int fd, __u32 handle) {
-  struct amdxdna_drm_sync_bo sync_params = {
-      .handle = handle,
-  };
+int sync_bo(int fd, __u32 handle) {
+  amdxdna_drm_sync_bo sync_params = {.handle = handle};
   int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params);
   if (ret != 0) {
     printf("Synch bo ioctl failed for handle %d\n", handle);
@@ -193,8 +194,8 @@ static int sync_bo(int fd, __u32 handle) {
   Create a BO_DEV and populate it with a PDI
 */
 
-static int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle,
-                    const char *path) {
+int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle,
+             const char *path) {
   FILE *file = fopen(path, "r");
   if (file == NULL) {
     perror("Failed to open instructions file.");
@@ -233,8 +234,8 @@ static int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle,
   Create a BO DEV and populate it with instructions whose virtual address is
   passed to the driver via an HSA packet.
 */
-static int load_instructions(int fd, uint64_t *vaddr, uint64_t *sram_addr,
-                             __u32 *handle, const char *path, __u32 *num_inst) {
+int load_instructions(int fd, uint64_t *vaddr, uint64_t *sram_addr,
+                      __u32 *handle, const char *path, __u32 *num_inst) {
   // read dpu instructions into an array
   FILE *file = fopen(path, "r");
   if (file == NULL) {