From d66ff5bfdcd1c7f9fe6cbfc56113f8074630326f Mon Sep 17 00:00:00 2001
From: Jiajie Chen
Date: Wed, 13 Dec 2023 23:20:06 +0800
Subject: [PATCH] Add vldi

---
 code/vldi.cpp    | 44 +++++++++++++++++++++++
 code/vldi.h      | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
 docs/lsx/misc.md |  4 ++-
 main.py          | 28 +++++++++++++++
 4 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 code/vldi.cpp
 create mode 100644 code/vldi.h

diff --git a/code/vldi.cpp b/code/vldi.cpp
new file mode 100644
index 00000000..f557ff39
--- /dev/null
+++ b/code/vldi.cpp
@@ -0,0 +1,44 @@
+#include "common.h"
+
+v128 vldi(int imm) {
+  v128 dst;
+#include "vldi.h"
+  return dst;
+}
+
+void test() {
+  // 0b000
+  FUZZ0(vldi, (0 << 10) | 0x12);
+  // 0b001
+  FUZZ0(vldi, (1 << 10) | 0x12);
+  // 0b010
+  FUZZ0(vldi, (2 << 10) | 0x12);
+  // 0b011
+  FUZZ0(vldi, (3 << 10) | 0x12);
+  // 0b10000
+  FUZZ0(vldi, -(((0b01111) << 8) | 0x12));
+  // 0b10001
+  FUZZ0(vldi, -(((0b01110) << 8) | 0x12));
+  // 0b10010
+  FUZZ0(vldi, -(((0b01101) << 8) | 0x12));
+  // 0b10011
+  FUZZ0(vldi, -(((0b01100) << 8) | 0x12));
+  // 0b10100
+  FUZZ0(vldi, -(((0b01011) << 8) | 0x12));
+  // 0b10101
+  FUZZ0(vldi, -(((0b01010) << 8) | 0x12));
+  // 0b10110
+  FUZZ0(vldi, -(((0b01001) << 8) | 0x12));
+  // 0b10111
+  FUZZ0(vldi, -(((0b01000) << 8) | 0x12));
+  // 0b11000
+  FUZZ0(vldi, -(((0b00111) << 8) | 0x12));
+  // 0b11001
+  FUZZ0(vldi, -(((0b00110) << 8) | 0x12));
+  // 0b11010
+  FUZZ0(vldi, -(((0b00101) << 8) | 0x12));
+  // 0b11011
+  FUZZ0(vldi, -(((0b00100) << 8) | 0x12));
+  // 0b11100
+  FUZZ0(vldi, -(((0b00011) << 8) | 0x12));
+}
diff --git a/code/vldi.h b/code/vldi.h
new file mode 100644
index 00000000..c2923cab
--- /dev/null
+++ b/code/vldi.h
@@ -0,0 +1,93 @@
+u64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+  broadcast_value = simm9_0;
+  broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+  broadcast_value = simm9_0;
+  broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+  broadcast_value = simm9_0;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+  broadcast_value = imm7_0 << 16;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+  broadcast_value = imm7_0 << 24;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+  broadcast_value = imm7_0;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+  broadcast_value = (imm7_0 << 8) | 0xFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+  broadcast_value = (imm7_0 << 16) | 0xFFFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+                    imm7 * 0xFF00000000000000;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+  broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+                    (imm5_0 << 48);
+  broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+  for (int i = 0; i < 16; i++) {
+    dst.byte[i] = broadcast_value;
+  }
+} else if (broadcast_width == 16) {
+  for (int i = 0; i < 8; i++) {
+    dst.half[i] = broadcast_value;
+  }
+} else if (broadcast_width == 32) {
+  for (int i = 0; i < 4; i++) {
+    dst.word[i] = broadcast_value;
+  }
+} else if (broadcast_width == 64) {
+  for (int i = 0; i < 2; i++) {
+    dst.dword[i] = broadcast_value;
+  }
+}
\ No newline at end of file
diff --git a/docs/lsx/misc.md b/docs/lsx/misc.md
index c3ba9275..dbbe3e87 100644
--- a/docs/lsx/misc.md
+++ b/docs/lsx/misc.md
@@ -91,4 +91,6 @@
 {{ vsigncov('b') }}
 {{ vsigncov('h') }}
 {{ vsigncov('w') }}
-{{ vsigncov('d') }}
\ No newline at end of file
+{{ vsigncov('d') }}
+
+{{ vldi() }}
\ No newline at end of file
diff --git a/main.py b/main.py
index aa6b768d..d9cf5c51 100644
--- a/main.py
+++ b/main.py
@@ -1588,6 +1588,34 @@ def vfnmsub_d():
             desc=f"Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.",
         )
 
+    @env.macro
+    def vldi():
+        return instruction(
+            intrinsic=f"__m128i __lsx_vldi (imm_n1024_1023 imm)",
+            instr=f"vldi vr, imm",
+            desc=f"""
+Initialize `dst` using predefined patterns:
+
+- `imm[12:10]=0b000`: broadcast `imm[7:0]` as 8-bit elements to all lanes
+- `imm[12:10]=0b001`: broadcast sign-extended `imm[9:0]` as 16-bit elements to all lanes
+- `imm[12:10]=0b010`: broadcast sign-extended `imm[9:0]` as 32-bit elements to all lanes
+- `imm[12:10]=0b011`: broadcast sign-extended `imm[9:0]` as 64-bit elements to all lanes
+- `imm[12:8]=0b10000`: broadcast `imm[7:0]` as 32-bit elements to all lanes
+- `imm[12:8]=0b10001`: broadcast `imm[7:0] << 8` as 32-bit elements to all lanes
+- `imm[12:8]=0b10010`: broadcast `imm[7:0] << 16` as 32-bit elements to all lanes
+- `imm[12:8]=0b10011`: broadcast `imm[7:0] << 24` as 32-bit elements to all lanes
+- `imm[12:8]=0b10100`: broadcast `imm[7:0]` as 16-bit elements to all lanes
+- `imm[12:8]=0b10101`: broadcast `imm[7:0] << 8` as 16-bit elements to all lanes
+- `imm[12:8]=0b10110`: broadcast `(imm[7:0] << 8) | 0xFF` as 32-bit elements to all lanes
+- `imm[12:8]=0b10111`: broadcast `(imm[7:0] << 16) | 0xFFFF` as 32-bit elements to all lanes
+- `imm[12:8]=0b11000`: broadcast `imm[7:0]` as 8-bit elements to all lanes
+- `imm[12:8]=0b11001`: repeat each bit of `imm[7:0]` eight times, and broadcast the result as 64-bit elements to all lanes
+- `imm[12:8]=0b11010`: broadcast `(imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)` as 32-bit elements to all lanes
+- `imm[12:8]=0b11011`: broadcast `(imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)` as 64-bit elements to all lanes
+- `imm[12:8]=0b11100`: broadcast `(imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)` as 64-bit elements to all lanes
+""",
+        )
+
     @env.macro
     def all_intrinsics():
         result = []