From d8ce748981e7ae29c22f0ad4cedfbac07edba8e4 Mon Sep 17 00:00:00 2001 From: Jiajie Chen Date: Tue, 12 Dec 2023 21:30:43 +0800 Subject: [PATCH] Add vsadd/vssub --- README.md | 16 ---------------- code/common.h | 26 ++++++++++++++++++++++++++ code/gen_impl.py | 8 ++++++++ code/gen_tb.py | 2 ++ code/vsadd_b.cpp | 9 +++++++++ code/vsadd_b.h | 3 +++ code/vsadd_bu.cpp | 9 +++++++++ code/vsadd_bu.h | 3 +++ code/vsadd_d.cpp | 9 +++++++++ code/vsadd_d.h | 3 +++ code/vsadd_du.cpp | 9 +++++++++ code/vsadd_du.h | 3 +++ code/vsadd_h.cpp | 9 +++++++++ code/vsadd_h.h | 3 +++ code/vsadd_hu.cpp | 9 +++++++++ code/vsadd_hu.h | 3 +++ code/vsadd_w.cpp | 9 +++++++++ code/vsadd_w.h | 3 +++ code/vsadd_wu.cpp | 9 +++++++++ code/vsadd_wu.h | 3 +++ code/vssub_b.cpp | 9 +++++++++ code/vssub_b.h | 3 +++ code/vssub_bu.cpp | 9 +++++++++ code/vssub_bu.h | 3 +++ code/vssub_d.cpp | 9 +++++++++ code/vssub_d.h | 3 +++ code/vssub_du.cpp | 9 +++++++++ code/vssub_du.h | 3 +++ code/vssub_h.cpp | 9 +++++++++ code/vssub_h.h | 3 +++ code/vssub_hu.cpp | 9 +++++++++ code/vssub_hu.h | 3 +++ code/vssub_w.cpp | 9 +++++++++ code/vssub_w.h | 3 +++ code/vssub_wu.cpp | 9 +++++++++ code/vssub_wu.h | 3 +++ docs/lsx/integer_computation.md | 10 ++++++++++ main.py | 26 +++++++++++++++++++++++--- 38 files changed, 261 insertions(+), 19 deletions(-) create mode 100644 code/vsadd_b.cpp create mode 100644 code/vsadd_b.h create mode 100644 code/vsadd_bu.cpp create mode 100644 code/vsadd_bu.h create mode 100644 code/vsadd_d.cpp create mode 100644 code/vsadd_d.h create mode 100644 code/vsadd_du.cpp create mode 100644 code/vsadd_du.h create mode 100644 code/vsadd_h.cpp create mode 100644 code/vsadd_h.h create mode 100644 code/vsadd_hu.cpp create mode 100644 code/vsadd_hu.h create mode 100644 code/vsadd_w.cpp create mode 100644 code/vsadd_w.h create mode 100644 code/vsadd_wu.cpp create mode 100644 code/vsadd_wu.h create mode 100644 code/vssub_b.cpp create mode 100644 code/vssub_b.h create mode 100644 code/vssub_bu.cpp create 
mode 100644 code/vssub_bu.h create mode 100644 code/vssub_d.cpp create mode 100644 code/vssub_d.h create mode 100644 code/vssub_du.cpp create mode 100644 code/vssub_du.h create mode 100644 code/vssub_h.cpp create mode 100644 code/vssub_h.h create mode 100644 code/vssub_hu.cpp create mode 100644 code/vssub_hu.h create mode 100644 code/vssub_w.cpp create mode 100644 code/vssub_w.h create mode 100644 code/vssub_wu.cpp create mode 100644 code/vssub_wu.h diff --git a/README.md b/README.md index 8135cc34..746e929f 100644 --- a/README.md +++ b/README.md @@ -6,22 +6,6 @@ Arranged from QEMU implementation and [GCC Intrinsics](https://gcc.gnu.org/onlin TODO List: -### vsadd.b/h/w/d - -Vector Saturated Add - -### vssub.b/h/w/d - -Vector Saturated Subtract - -### vsadd.bu/hu/wu/du - -Vector Saturated Add Unsigned - -### vssub.bu/hu/wu/du - -Vector Saturated Subtract Unsigned - ### vmuh.b/h/w/d/bu/hu/wu/du Vector Multiplication High diff --git a/code/common.h b/code/common.h index 6090f758..99fa18c9 100644 --- a/code/common.h +++ b/code/common.h @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -49,6 +50,31 @@ template u8 popcount(T num) { return res; } +// saturating add/sub +// Adapted from: +// https://codereview.stackexchange.com/questions/179172/c17-saturating-integer-arithmetic-type-library +template T sadd(T a, T b) { + T result; + if (b < 0) { + return __builtin_add_overflow(a, b, &result) ? std::numeric_limits::min() + : result; + } else { + return __builtin_add_overflow(a, b, &result) ? std::numeric_limits::max() + : result; + } +} + +template T ssub(T a, T b) { + T result; + if (b < 0) { + return __builtin_sub_overflow(a, b, &result) ? std::numeric_limits::max() + : result; + } else { + return __builtin_sub_overflow(a, b, &result) ? 
std::numeric_limits::min() + : result; + } +} + using std::max; using std::min; diff --git a/code/gen_impl.py b/code/gen_impl.py index 856821c3..9dfa438c 100644 --- a/code/gen_impl.py +++ b/code/gen_impl.py @@ -151,6 +151,14 @@ file=f, ) print(f"}}", file=f) + for op in ["sadd", "ssub"]: + with open(f"v{op}_{width}.h", "w") as f: + print(f"for (int i = 0;i < {128 // w};i++) {{", file=f) + print( + f" dst.{m}[i] = ({sign}{w}){op}(({sign}{w})a.{m}[i], ({sign}{w})b.{m}[i]);", + file=f, + ) + print(f"}}", file=f) for width in ["b", "bu", "h", "hu", "w", "wu", "d", "du"]: double_width = double_widths[width] diff --git a/code/gen_tb.py b/code/gen_tb.py index 9b7087c4..ea89cc05 100644 --- a/code/gen_tb.py +++ b/code/gen_tb.py @@ -73,6 +73,8 @@ "vmulwev": (widths_vaddw, "v128 a, v128 b"), "vmulwod": (widths_vaddw, "v128 a, v128 b"), "vpcnt": (widths_signed, "v128 a"), + "vsadd": (widths_all, "v128 a, v128 b"), + "vssub": (widths_all, "v128 a, v128 b"), "vseq": (widths_signed, "v128 a, v128 b"), "vslt": (widths_all, "v128 a, v128 b"), "vsle": (widths_all, "v128 a, v128 b"), diff --git a/code/vsadd_b.cpp b/code/vsadd_b.cpp new file mode 100644 index 00000000..be4dc221 --- /dev/null +++ b/code/vsadd_b.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vsadd_b(v128 a, v128 b) { + v128 dst; +#include "vsadd_b.h" + return dst; +} + +void test() { FUZZ2(vsadd_b); } diff --git a/code/vsadd_b.h b/code/vsadd_b.h new file mode 100644 index 00000000..85cecd75 --- /dev/null +++ b/code/vsadd_b.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 16; i++) { + dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]); +} diff --git a/code/vsadd_bu.cpp b/code/vsadd_bu.cpp new file mode 100644 index 00000000..d021d3f7 --- /dev/null +++ b/code/vsadd_bu.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vsadd_bu(v128 a, v128 b) { + v128 dst; +#include "vsadd_bu.h" + return dst; +} + +void test() { FUZZ2(vsadd_bu); } diff --git a/code/vsadd_bu.h b/code/vsadd_bu.h new file mode 100644 index 00000000..c40c7b6c --- 
/dev/null +++ b/code/vsadd_bu.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 16; i++) { + dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]); +} diff --git a/code/vsadd_d.cpp b/code/vsadd_d.cpp new file mode 100644 index 00000000..96357c98 --- /dev/null +++ b/code/vsadd_d.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vsadd_d(v128 a, v128 b) { + v128 dst; +#include "vsadd_d.h" + return dst; +} + +void test() { FUZZ2(vsadd_d); } diff --git a/code/vsadd_d.h b/code/vsadd_d.h new file mode 100644 index 00000000..ca725258 --- /dev/null +++ b/code/vsadd_d.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 2; i++) { + dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]); +} diff --git a/code/vsadd_du.cpp b/code/vsadd_du.cpp new file mode 100644 index 00000000..a5f87d58 --- /dev/null +++ b/code/vsadd_du.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vsadd_du(v128 a, v128 b) { + v128 dst; +#include "vsadd_du.h" + return dst; +} + +void test() { FUZZ2(vsadd_du); } diff --git a/code/vsadd_du.h b/code/vsadd_du.h new file mode 100644 index 00000000..361d520b --- /dev/null +++ b/code/vsadd_du.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 2; i++) { + dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]); +} diff --git a/code/vsadd_h.cpp b/code/vsadd_h.cpp new file mode 100644 index 00000000..65694213 --- /dev/null +++ b/code/vsadd_h.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vsadd_h(v128 a, v128 b) { + v128 dst; +#include "vsadd_h.h" + return dst; +} + +void test() { FUZZ2(vsadd_h); } diff --git a/code/vsadd_h.h b/code/vsadd_h.h new file mode 100644 index 00000000..48b27a2f --- /dev/null +++ b/code/vsadd_h.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 8; i++) { + dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]); +} diff --git a/code/vsadd_hu.cpp b/code/vsadd_hu.cpp new file mode 100644 index 00000000..2b480457 --- /dev/null +++ b/code/vsadd_hu.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vsadd_hu(v128 a, v128 b) { + v128 dst; +#include "vsadd_hu.h" + return dst; +} + +void test() 
{ FUZZ2(vsadd_hu); } diff --git a/code/vsadd_hu.h b/code/vsadd_hu.h new file mode 100644 index 00000000..7a3278c0 --- /dev/null +++ b/code/vsadd_hu.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 8; i++) { + dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]); +} diff --git a/code/vsadd_w.cpp b/code/vsadd_w.cpp new file mode 100644 index 00000000..470caf1a --- /dev/null +++ b/code/vsadd_w.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vsadd_w(v128 a, v128 b) { + v128 dst; +#include "vsadd_w.h" + return dst; +} + +void test() { FUZZ2(vsadd_w); } diff --git a/code/vsadd_w.h b/code/vsadd_w.h new file mode 100644 index 00000000..c4f0787e --- /dev/null +++ b/code/vsadd_w.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 4; i++) { + dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]); +} diff --git a/code/vsadd_wu.cpp b/code/vsadd_wu.cpp new file mode 100644 index 00000000..a140070f --- /dev/null +++ b/code/vsadd_wu.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vsadd_wu(v128 a, v128 b) { + v128 dst; +#include "vsadd_wu.h" + return dst; +} + +void test() { FUZZ2(vsadd_wu); } diff --git a/code/vsadd_wu.h b/code/vsadd_wu.h new file mode 100644 index 00000000..1cfd5bbb --- /dev/null +++ b/code/vsadd_wu.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 4; i++) { + dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]); +} diff --git a/code/vssub_b.cpp b/code/vssub_b.cpp new file mode 100644 index 00000000..845bfa3d --- /dev/null +++ b/code/vssub_b.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vssub_b(v128 a, v128 b) { + v128 dst; +#include "vssub_b.h" + return dst; +} + +void test() { FUZZ2(vssub_b); } diff --git a/code/vssub_b.h b/code/vssub_b.h new file mode 100644 index 00000000..48c7dbc8 --- /dev/null +++ b/code/vssub_b.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 16; i++) { + dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]); +} diff --git a/code/vssub_bu.cpp b/code/vssub_bu.cpp new file mode 100644 index 00000000..85c14172 --- /dev/null +++ b/code/vssub_bu.cpp @@ -0,0 +1,9 @@ +#include 
"common.h" + +v128 vssub_bu(v128 a, v128 b) { + v128 dst; +#include "vssub_bu.h" + return dst; +} + +void test() { FUZZ2(vssub_bu); } diff --git a/code/vssub_bu.h b/code/vssub_bu.h new file mode 100644 index 00000000..6b48fdef --- /dev/null +++ b/code/vssub_bu.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 16; i++) { + dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]); +} diff --git a/code/vssub_d.cpp b/code/vssub_d.cpp new file mode 100644 index 00000000..abd074a4 --- /dev/null +++ b/code/vssub_d.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vssub_d(v128 a, v128 b) { + v128 dst; +#include "vssub_d.h" + return dst; +} + +void test() { FUZZ2(vssub_d); } diff --git a/code/vssub_d.h b/code/vssub_d.h new file mode 100644 index 00000000..5a2d0615 --- /dev/null +++ b/code/vssub_d.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 2; i++) { + dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]); +} diff --git a/code/vssub_du.cpp b/code/vssub_du.cpp new file mode 100644 index 00000000..f919092f --- /dev/null +++ b/code/vssub_du.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vssub_du(v128 a, v128 b) { + v128 dst; +#include "vssub_du.h" + return dst; +} + +void test() { FUZZ2(vssub_du); } diff --git a/code/vssub_du.h b/code/vssub_du.h new file mode 100644 index 00000000..343d0812 --- /dev/null +++ b/code/vssub_du.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 2; i++) { + dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]); +} diff --git a/code/vssub_h.cpp b/code/vssub_h.cpp new file mode 100644 index 00000000..7317c151 --- /dev/null +++ b/code/vssub_h.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vssub_h(v128 a, v128 b) { + v128 dst; +#include "vssub_h.h" + return dst; +} + +void test() { FUZZ2(vssub_h); } diff --git a/code/vssub_h.h b/code/vssub_h.h new file mode 100644 index 00000000..c61130c0 --- /dev/null +++ b/code/vssub_h.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 8; i++) { + dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]); +} diff --git a/code/vssub_hu.cpp 
b/code/vssub_hu.cpp new file mode 100644 index 00000000..bb92162a --- /dev/null +++ b/code/vssub_hu.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vssub_hu(v128 a, v128 b) { + v128 dst; +#include "vssub_hu.h" + return dst; +} + +void test() { FUZZ2(vssub_hu); } diff --git a/code/vssub_hu.h b/code/vssub_hu.h new file mode 100644 index 00000000..ed730cfe --- /dev/null +++ b/code/vssub_hu.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 8; i++) { + dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]); +} diff --git a/code/vssub_w.cpp b/code/vssub_w.cpp new file mode 100644 index 00000000..2d89488f --- /dev/null +++ b/code/vssub_w.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vssub_w(v128 a, v128 b) { + v128 dst; +#include "vssub_w.h" + return dst; +} + +void test() { FUZZ2(vssub_w); } diff --git a/code/vssub_w.h b/code/vssub_w.h new file mode 100644 index 00000000..8bd9ed43 --- /dev/null +++ b/code/vssub_w.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 4; i++) { + dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]); +} diff --git a/code/vssub_wu.cpp b/code/vssub_wu.cpp new file mode 100644 index 00000000..ba8c1bc9 --- /dev/null +++ b/code/vssub_wu.cpp @@ -0,0 +1,9 @@ +#include "common.h" + +v128 vssub_wu(v128 a, v128 b) { + v128 dst; +#include "vssub_wu.h" + return dst; +} + +void test() { FUZZ2(vssub_wu); } diff --git a/code/vssub_wu.h b/code/vssub_wu.h new file mode 100644 index 00000000..b1b46bc5 --- /dev/null +++ b/code/vssub_wu.h @@ -0,0 +1,3 @@ +for (int i = 0; i < 4; i++) { + dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]); +} diff --git a/docs/lsx/integer_computation.md b/docs/lsx/integer_computation.md index 86ecfd56..a498d70a 100644 --- a/docs/lsx/integer_computation.md +++ b/docs/lsx/integer_computation.md @@ -186,6 +186,16 @@ {{ vmulwev('q', 'du') }} {{ vmulwev('q', 'du', 'd') }} +{{ vsadd('b') }} +{{ vsadd('h') }} +{{ vsadd('w') }} +{{ vsadd('d') }} + +{{ vssub('b') }} +{{ vssub('h') }} +{{ vssub('w') }} +{{ vssub('d') }} + {{ vsub('b') }} {{ vsub('h') 
}} {{ vsub('w') }} diff --git a/main.py b/main.py index b22efb2a..06da595e 100644 --- a/main.py +++ b/main.py @@ -559,7 +559,7 @@ def vseq(name): width = widths[name] return instruction( intrinsic=f"__m128i __lsx_vseq_{name} (__m128i a, __m128i b)", - instr=f"vseq.{name} vr, vr", + instr=f"vseq.{name} vr, vr, vr", desc=f"Compare the {width}-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.", ) @@ -569,7 +569,7 @@ def vslt(name): signedness = signednesses[name] return instruction( intrinsic=f"__m128i __lsx_vslt_{name} (__m128i a, __m128i b)", - instr=f"vslt.{name} vr, vr", + instr=f"vslt.{name} vr, vr, vr", desc=f"Compare the {signedness} {width}-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.", ) @@ -579,6 +579,26 @@ def vsle(name): signedness = signednesses[name] return instruction( intrinsic=f"__m128i __lsx_vslt_{name} (__m128i a, __m128i b)", - instr=f"vslt.{name} vr, vr", + instr=f"vslt.{name} vr, vr, vr", desc=f"Compare the {signedness} {width}-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.", + ) + + @env.macro + def vsadd(name): + width = widths[name] + signedness = signednesses[name] + return instruction( + intrinsic=f"__m128i __lsx_vsadd_{name} (__m128i a, __m128i b)", + instr=f"vsadd.{name} vr, vr, vr", + desc=f"Saturating add the {signedness} {width}-bit elements in `a` and `b`, store the result to `dst`.", + ) + + @env.macro + def vssub(name): + width = widths[name] + signedness = signednesses[name] + return instruction( + intrinsic=f"__m128i __lsx_vssub_{name} (__m128i a, __m128i b)", + instr=f"vssub.{name} vr, vr, vr", + desc=f"Saturating subtract the {signedness} {width}-bit elements in `b` from `a`, store the result to `dst`.", ) \ No newline at end of file