Skip to content

Commit

Permalink
Merge branch 'main' into zhewen_batch_sync
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-Zhewen authored Dec 16, 2024
2 parents 344b796 + f7cd097 commit ac2a7ec
Show file tree
Hide file tree
Showing 10 changed files with 217 additions and 28 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ Build and install `xdna-driver`, use commit `59f1d62`:
git clone git@github.com:amd/xdna-driver.git
cd <root-of-source-tree>
# get code for submodules
git checkout 59f1d62
git checkout 929e8ab
git submodule update --init --recursive
```

Follow the instructions to build and install the driver module: [xdna-driver](https://github.com/amd/xdna-driver/tree/59f1d6235334499b22dbd056a60ab00bfec142ee).
Follow the instructions to build and install the driver module: [xdna-driver](https://github.com/amd/xdna-driver/tree/929e8ab459cab5915631849b9f1ef9a4982d1c11).

## Building (along with IREE)

Expand Down
12 changes: 6 additions & 6 deletions build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ class Union(ctypes.Union, AsDictMixin):
4: "DRM_AMDXDNA_GET_BO_INFO",
5: "DRM_AMDXDNA_SYNC_BO",
6: "DRM_AMDXDNA_EXEC_CMD",
7: "DRM_AMDXDNA_WAIT_CMD",
8: "DRM_AMDXDNA_GET_INFO",
9: "DRM_AMDXDNA_SET_STATE",
7: "DRM_AMDXDNA_GET_INFO",
8: "DRM_AMDXDNA_SET_STATE",
9: "DRM_AMDXDNA_WAIT_CMD",
10: "DRM_AMDXDNA_SUBMIT_WAIT",
11: "DRM_AMDXDNA_SUBMIT_SIGNAL",
12: "DRM_AMDXDNA_NUM_IOCTLS",
Expand All @@ -143,9 +143,9 @@ class Union(ctypes.Union, AsDictMixin):
DRM_AMDXDNA_GET_BO_INFO = 4
DRM_AMDXDNA_SYNC_BO = 5
DRM_AMDXDNA_EXEC_CMD = 6
DRM_AMDXDNA_WAIT_CMD = 7
DRM_AMDXDNA_GET_INFO = 8
DRM_AMDXDNA_SET_STATE = 9
DRM_AMDXDNA_GET_INFO = 7
DRM_AMDXDNA_SET_STATE = 8
DRM_AMDXDNA_WAIT_CMD = 9
DRM_AMDXDNA_SUBMIT_WAIT = 10
DRM_AMDXDNA_SUBMIT_SIGNAL = 11
DRM_AMDXDNA_NUM_IOCTLS = 12
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// input ${M}x${K}x${TYPE1}
// input ${N}x${K}x${TYPE1}

// Template for a matmul whose second operand is supplied transposed:
// %arg0 is MxK, %arg1 is NxK, and the result is MxN.
func.func @matmul_transpose_b(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<${N}x${K}x${TYPE1}>) -> tensor<${M}x${N}x${TYPE2}>
{
// Scalar used to initialise every element of the accumulator tensor.
%cst = arith.constant ${ZERO} : ${TYPE2}
%0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
// Fill the MxN accumulator with the scalar before the matmul writes into it.
%1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
%2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${N}x${K}x${TYPE1}>)
outs(%1: tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
return %2: tensor<${M}x${N}x${TYPE2}>
}
67 changes: 63 additions & 4 deletions build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,43 @@ def _execute(self, config):
return self.benchmark(config)


class MatmulTransposeB(BaseMatmul):
    """
    Test for matmul_transpose_b(A, B), with A of shape MxK and B of shape NxK.

    Note: `use_ukernel` is part of the constructor signature but is not
    passed on to the base class by this constructor.
    """

    def __init__(
        self,
        M,
        N,
        K,
        input_type,
        acc_type,
        use_ukernel=False,
        run_on_target=["npu1_4col"],
    ):
        # Collect the base-class arguments first so the call itself stays short.
        base_kwargs = dict(
            run_on_target=run_on_target,
            aie_compilation_flags=None,
            M=M,
            N=N,
            K=K,
            input_type=input_type,
            acc_type=acc_type,
        )
        super().__init__(**base_kwargs)

        self.labels.append("MatmulTransposeB")
        self.name = f"matmul_transpose_b_{M}_{N}_{K}_{input_type}_{acc_type}"

    def _execute(self, config):
        # Instantiate the transposed-B template, then run the `vs_cpu` check.
        template_name = (
            config.file_dir / "matmul_template" / "matmul_transpose_b_MxK_NxK.mlir"
        )
        self.generate(config, template_name)
        self.vs_cpu(config)
        return True


class MatmulThinBias(BaseMatmul):
"""
A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:N
Expand Down Expand Up @@ -1412,6 +1449,15 @@ def __init__(self):
self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32", use_ukernel=True))
self.register(MatmulThinBias(1024, 1024, 512, "bf16", "f32"))

# MatmulFullBias test:
self.register(MatmulFullBias(128, 128, 256, "i32", "i32"))

# MatmulTransposeB test(s):
for input_type, acc_type in zip(["i8", "bf16"], ["i32", "f32"]):
self.register(MatmulTransposeB(32, 32, 32, input_type, acc_type))
self.register(MatmulTransposeB(128, 256, 128, input_type, acc_type))
self.register(MatmulTransposeB(1536, 1536, 2048, input_type, acc_type))

# Matmul test(s):
self.register(
Matmul(
Expand Down Expand Up @@ -1458,6 +1504,22 @@ def __init__(self):
)
)

# Matmul test on 2(rows)x2(cols) cores
self.register(
Matmul(
32,
32,
32,
"bf16",
"f32",
aie_compilation_flags=[
"--iree-amdaie-num-rows=2",
"--iree-amdaie-num-cols=2",
],
name_suffix="2rows_2cols",
)
)

performance_tests = [
{
"M": 512,
Expand Down Expand Up @@ -1655,9 +1717,6 @@ def __init__(self):
for name in ["two_matmul_switching", "matmul_f32_8_8_4", "matmul_f32_8_4_8"]:
self.register(MultipleDispatches(name))

# MatmulFullBias test:
self.register(MatmulFullBias(128, 128, 256, "i32", "i32"))

# Convolution NHCWQ test:
self.register(ConvolutionNHWCQ())

Expand Down Expand Up @@ -1718,7 +1777,7 @@ def all_tests(
that directory.
3) create a new matmul template in `./matmul_template`, for example if you
want to add a new variant with tranposed operands or unary elementwise
want to add a new variant with transposed operands or unary elementwise
operations.
4) create a new template generator, duplicating the directory structure of
Expand Down
1 change: 0 additions & 1 deletion build_tools/ci/run_matmul_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,6 @@ run_matmul_test_on_shapes ${bf16_i8_shapes_medium[@]} \
--num_repeat_runs "2"



# note this will not actually show any devices because --xrt_lite_n_core_rows --xrt_lite_n_core_cols are not passed
# which i have omitted to make the conditional slightly more succinct
if [[ $($IREE_INSTALL_DIR/bin/iree-benchmark-module --dump_devices | grep xrt-lite) ]]; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ enum amdxdna_drm_ioctl_id {
DRM_AMDXDNA_GET_BO_INFO,
DRM_AMDXDNA_SYNC_BO,
DRM_AMDXDNA_EXEC_CMD,
DRM_AMDXDNA_WAIT_CMD,
DRM_AMDXDNA_GET_INFO,
DRM_AMDXDNA_SET_STATE,
DRM_AMDXDNA_WAIT_CMD,
DRM_AMDXDNA_NUM_IOCTLS
};

Expand All @@ -57,6 +57,23 @@ enum amdxdna_device_type {
AMDXDNA_DEV_TYPE_UMQ,
};

/*
* Enum for priority in application's QoS. Values copied from Windows shim layer.
* AMDXDNA_QOS_DEFAULT_PRIORITY: Default priority.
* AMDXDNA_QOS_REALTIME_PRIORITY: Real time clients.
* AMDXDNA_QOS_HIGH_PRIORITY: Best effort foreground clients.
* AMDXDNA_QOS_NORMAL_PRIORITY: Best effort or background clients.
* AMDXDNA_QOS_LOW_PRIORITY: Clients that can wait an indefinite amount of time
* for completion.
*/
enum amdxdna_qos_priority {
AMDXDNA_QOS_DEFAULT_PRIORITY = 0x0,
AMDXDNA_QOS_REALTIME_PRIORITY = 0x100,
AMDXDNA_QOS_HIGH_PRIORITY = 0x180,
AMDXDNA_QOS_NORMAL_PRIORITY = 0x200,
AMDXDNA_QOS_LOW_PRIORITY = 0x280
};

/**
* struct qos_info - QoS information for driver.
* @gops: Giga operations per second.
Expand Down Expand Up @@ -89,6 +106,8 @@ struct amdxdna_qos_info {
* @mem_size: Size of AIE tile memory.
* @umq_doorbell: Returned offset of doorbell associated with UMQ.
* @handle: Returned hardware context handle.
* @syncobj_handle: The drm timeline syncobj handle for command completion
* notification.
* @pad: Structure padding.
*/
struct amdxdna_drm_create_hwctx {
Expand All @@ -102,6 +121,7 @@ struct amdxdna_drm_create_hwctx {
__u32 mem_size;
__u32 umq_doorbell;
__u32 handle;
__u32 syncobj_handle;
__u32 pad;
};

Expand Down Expand Up @@ -156,7 +176,7 @@ enum amdxdna_drm_config_hwctx_param {
* @param_val_size: Size of the parameter buffer pointed to by the param_val.
* If param_val is not a pointer, driver can ignore this.
* @pad: Structure padding.
*
*
* Note: if the param_val is a pointer pointing to a buffer, the maximum size
* of the buffer is 4KiB(PAGE_SIZE).
*/
Expand Down Expand Up @@ -493,6 +513,17 @@ struct amdxdna_drm_query_firmware_version {
__u32 build; /* out */
};

/**
* struct amdxdna_drm_get_force_preempt_state - Get force preemption state.
* @state: 1 implies force preemption is enabled.
* 0 implies disabled.
* @pad: MBZ.
*/
struct amdxdna_drm_get_force_preempt_state {
__u8 state;
__u8 pad[7];
};

enum amdxdna_drm_get_param {
DRM_AMDXDNA_QUERY_AIE_STATUS,
DRM_AMDXDNA_QUERY_AIE_METADATA,
Expand All @@ -505,6 +536,7 @@ enum amdxdna_drm_get_param {
DRM_AMDXDNA_QUERY_FIRMWARE_VERSION,
DRM_AMDXDNA_GET_POWER_MODE,
DRM_AMDXDNA_QUERY_TELEMETRY,
DRM_AMDXDNA_GET_FORCE_PREEMPT_STATE,
DRM_AMDXDNA_NUM_GET_PARAM,
};

Expand All @@ -531,10 +563,22 @@ struct amdxdna_drm_set_power_mode {
__u8 pad[7];
};

/**
* struct amdxdna_drm_set_force_preempt_state - Set force preemption state.
* @state: 1 implies force preemption is enabled.
* 0 implies disabled.
* @pad: MBZ.
*/
struct amdxdna_drm_set_force_preempt_state {
__u8 state;
__u8 pad[7];
};

/*
* Identifiers for the parameters that can be set (presumably through the
* DRM_AMDXDNA_SET_STATE request - confirm against the driver).
* Enumerators take consecutive implicit values starting at 0, so
* DRM_AMDXDNA_NUM_SET_PARAM, being last, equals the number of identifiers
* listed before it; new identifiers must be added above it.
*/
enum amdxdna_drm_set_param {
DRM_AMDXDNA_SET_POWER_MODE,
DRM_AMDXDNA_WRITE_AIE_MEM,
DRM_AMDXDNA_WRITE_AIE_REG,
DRM_AMDXDNA_SET_FORCE_PREEMPT,
DRM_AMDXDNA_NUM_SET_PARAM,
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ uint32_t bo::get_arg_bo_handles(uint32_t *handles, size_t num) const {
shim_err(E2BIG, "There are %ld BO args, provided buffer can hold only %ld",
sz, num);

for (auto m : m_args_map) *(handles++) = m.second;
for (auto &m : m_args_map) *(handles++) = m.second;

return sz;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ hw_ctx::hw_ctx(device &dev, const std::map<std::string, uint32_t> &qos,
m_num_rows(n_rows),
m_num_cols(n_cols),
m_doorbell(0),
m_syncobj(AMDXDNA_INVALID_FENCE_HANDLE),
m_log_buf(nullptr) {
SHIM_DEBUG("Creating HW context...");

Expand Down Expand Up @@ -92,6 +93,7 @@ hw_ctx::hw_ctx(device &device, const std::vector<uint8_t> &pdi,

// Tears the context down: removes it from the device, then releases the
// syncobj handle that was recorded for it. The two debug traces bracket the
// teardown so that "Destroying" is no longer reported after the work is done.
hw_ctx::~hw_ctx() {
  SHIM_DEBUG("Destroying KMQ HW context (%d)...", m_handle);
  delete_ctx_on_device();
  delete_syncobj();
  SHIM_DEBUG("Destroyed HW context (%d)...", m_handle);
}
Expand Down Expand Up @@ -134,6 +136,7 @@ void hw_ctx::create_ctx_on_device() {

m_handle = arg.handle;
m_doorbell = arg.umq_doorbell;
m_syncobj = arg.syncobj_handle;

m_q->bind_hwctx(this);
}
Expand All @@ -149,17 +152,40 @@ void hw_ctx::delete_ctx_on_device() const {
fini_log_buf();
}

void hw_ctx::delete_syncobj() const {
if (m_syncobj == AMDXDNA_INVALID_FENCE_HANDLE) return;
drm_syncobj_destroy dsobj = {.handle = m_syncobj};
m_device.get_pdev().ioctl(DRM_IOCTL_SYNCOBJ_DESTROY, &dsobj);
}

// Allocates and maps the log buffer object, then writes the metadata header
// at its start.
//
// Layout: sizeof(m_metadata) bytes of header followed by one column_size
// region per column (set_metadata records each region's address and size).
void hw_ctx::init_log_buf() {
  // Bytes of log space reserved for each column.
  const size_t column_size = 1024;
  // The header is stored in the same buffer, so its size must be included.
  auto log_buf_size = m_num_cols * column_size + sizeof(m_metadata);
  shim_xcl_bo_flags f;
  f.flags = XCL_BO_FLAGS_EXECBUF;
  m_log_bo = alloc_bo(log_buf_size, f);
  m_log_buf = m_log_bo->map();
  uint64_t bo_paddr = m_log_bo->get_properties().paddr;
  // NOTE(review): the literal 1 equals trace_buffer in enum cert_log_flag;
  // confirm that this is the intended meaning of the flag.
  set_metadata(m_num_cols, column_size, bo_paddr, 1);
  // Clear the whole buffer before publishing the header at offset 0.
  std::memset(m_log_buf, 0, log_buf_size);
  std::memcpy(m_log_buf, &m_metadata, sizeof(m_metadata));
}

// Unmaps m_log_buf from the log buffer object; a no-op when m_log_bo is empty.
void hw_ctx::fini_log_buf() const {
  if (!m_log_bo) return;
  m_log_bo->unmap(m_log_buf);
}

void hw_ctx::set_metadata(int num_cols, size_t size, uint64_t bo_paddr,
uint8_t flag) {
m_metadata.magic_no = CERT_MAGIC_NO;
m_metadata.major = 0;
m_metadata.minor = 1;
m_metadata.cert_log_flag = flag;
m_metadata.num_cols = num_cols;
for (int i = 0; i < num_cols; i++) {
m_metadata.col_paddr[i] = bo_paddr + size * i + sizeof(m_metadata);
m_metadata.col_size[i] = size;
}
}

} // namespace shim_xdna
18 changes: 18 additions & 0 deletions runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,19 @@ struct hw_q;
struct bo;
struct device;

// Log buffer kinds: debug_buffer is 0 and trace_buffer is 1.
// NOTE(review): presumably the values stored in
// cert_log_metadata::cert_log_flag - confirm.
enum cert_log_flag { debug_buffer = 0, trace_buffer };

// Header describing the per-column log regions. hw_ctx::set_metadata fills
// it in and hw_ctx::init_log_buf copies it to the start of the log buffer.
struct cert_log_metadata {
// The bytes 0x43 0x45 0x52 0x54 are ASCII "CERT". Being a preprocessor macro,
// this name is not scoped to the struct.
#define CERT_MAGIC_NO 0x43455254 // "CERT"
uint32_t magic_no;
uint8_t major;
uint8_t minor;
// This member shares its name with enum cert_log_flag.
uint8_t cert_log_flag;
uint8_t num_cols; // how many valid cols, up to 8 for now
uint64_t col_paddr[8]; // device accessible address array for each valid col
uint32_t col_size[8]; // bo size for each valid col
};

struct cu_info {
std::string m_name;
size_t m_func;
Expand All @@ -40,11 +53,13 @@ struct hw_ctx {
uint32_t m_handle = AMDXDNA_INVALID_CTX_HANDLE;
amdxdna_qos_info m_qos = {};
std::vector<cu_info> m_cu_info;
cert_log_metadata m_metadata;
std::unique_ptr<hw_q> m_q;
uint32_t m_ops_per_cycle;
uint32_t m_num_rows;
uint32_t m_num_cols;
uint32_t m_doorbell;
uint32_t m_syncobj;
std::unique_ptr<bo> m_log_bo;
void *m_log_buf;
std::vector<std::unique_ptr<bo>> m_pdi_bos;
Expand All @@ -68,8 +83,11 @@ struct hw_ctx {
void init_log_buf();
void fini_log_buf() const;
void delete_ctx_on_device() const;
void delete_syncobj() const;

hw_q *get_hw_queue() const;

void set_metadata(int num_cols, size_t size, uint64_t bo_paddr, uint8_t flag);
};

} // namespace shim_xdna
Expand Down
Loading

0 comments on commit ac2a7ec

Please sign in to comment.