From 8a226b6ee7521eb1b02d0d60742a1a7f0eaa3277 Mon Sep 17 00:00:00 2001
From: Jiajie Chen <c@jia.je>
Date: Tue, 12 Dec 2023 19:57:11 +0800
Subject: [PATCH] Add vsub

---
 README.md                       | 6 ------
 code/gen_impl.py                | 8 ++++++++
 code/gen_tb.py                  | 2 ++
 code/vadd_b.h                   | 2 +-
 code/vsub_b.cpp                 | 9 +++++++++
 code/vsub_b.h                   | 3 +++
 code/vsub_d.cpp                 | 9 +++++++++
 code/vsub_d.h                   | 3 +++
 code/vsub_h.cpp                 | 9 +++++++++
 code/vsub_h.h                   | 3 +++
 code/vsub_q.cpp                 | 9 +++++++++
 code/vsub_q.h                   | 1 +
 code/vsub_w.cpp                 | 9 +++++++++
 code/vsub_w.h                   | 3 +++
 docs/lsx/integer_computation.md | 7 +++++++
 main.py                         | 8 ++++++++
 16 files changed, 84 insertions(+), 7 deletions(-)
 create mode 100644 code/vsub_b.cpp
 create mode 100644 code/vsub_b.h
 create mode 100644 code/vsub_d.cpp
 create mode 100644 code/vsub_d.h
 create mode 100644 code/vsub_h.cpp
 create mode 100644 code/vsub_h.h
 create mode 100644 code/vsub_q.cpp
 create mode 100644 code/vsub_q.h
 create mode 100644 code/vsub_w.cpp
 create mode 100644 code/vsub_w.h

diff --git a/README.md b/README.md
index daaf0f83..c8654c3a 100644
--- a/README.md
+++ b/README.md
@@ -22,10 +22,6 @@ Vector Store with Register Offset
 
 Vector Set Equal/Less than or Equal/Less Than
 
-### vsub.b/h/w/d
-
-Vector Subtract
-
 ### vsadd.b/h/w/d
 
 Vector Saturated Add
@@ -124,8 +120,6 @@ Vector Multiplication High
 
 ### vfrstp.b/h
 
-### vsub.q
-
 ### vsignconv.b/h/w/d
 
 ### vfsub.s/d
diff --git a/code/gen_impl.py b/code/gen_impl.py
index 815896ca..ff49850b 100644
--- a/code/gen_impl.py
+++ b/code/gen_impl.py
@@ -186,6 +186,14 @@
 for width in ["b", "h", "w", "d"]:
     w = widths[width]
     m = members[width]
+    for name, op in [("add", "+"), ("sub", "-")]:
+        with open(f"v{name}_{width}.h", "w") as f:
+            print(f"for (int i = 0;i < {128 // w};i++) {{", file=f)
+            print(
+                f"  dst.{m}[i] = a.{m}[i] {op} b.{m}[i];",
+                file=f,
+            )
+            print(f"}}", file=f)
     with open(f"vbitclr_{width}.h", "w") as f:
         print(f"for (int i = 0;i < {128 // w};i++) {{", file=f)
         print(
diff --git a/code/gen_tb.py b/code/gen_tb.py
index dae86f91..b97900db 100644
--- a/code/gen_tb.py
+++ b/code/gen_tb.py
@@ -31,6 +31,7 @@
 
 tb = {
     # widths, args, extra args for imm
+    "vadd": (widths_signed, "v128 a, v128 b"),
     "vavg": (widths_all, "v128 a, v128 b"),
     "vavgr": (widths_all, "v128 a, v128 b"),
     "vaddwev": (widths_vaddw, "v128 a, v128 b"),
@@ -66,6 +67,7 @@
     "vmini": (widths_all, "v128 a, int imm", [0, 3, 15]),
     "vmulwev": (widths_vaddw, "v128 a, v128 b"),
     "vmulwod": (widths_vaddw, "v128 a, v128 b"),
+    "vsub": (widths_signed, "v128 a, v128 b"),
     "vsubwev": (widths_vsubw, "v128 a, v128 b"),
     "vsubwod": (widths_vsubw, "v128 a, v128 b"),
 }
diff --git a/code/vadd_b.h b/code/vadd_b.h
index 0f8b82f0..877bbbf6 100644
--- a/code/vadd_b.h
+++ b/code/vadd_b.h
@@ -1,3 +1,3 @@
 for (int i = 0; i < 16; i++) {
   dst.byte[i] = a.byte[i] + b.byte[i];
-}
\ No newline at end of file
+}
diff --git a/code/vsub_b.cpp b/code/vsub_b.cpp
new file mode 100644
index 00000000..9eacccb0
--- /dev/null
+++ b/code/vsub_b.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vsub_b(v128 a, v128 b) {
+  v128 dst;
+#include "vsub_b.h"
+  return dst;
+}
+
+void test() { FUZZ2(vsub_b); }
diff --git a/code/vsub_b.h b/code/vsub_b.h
new file mode 100644
index 00000000..0fc73f9d
--- /dev/null
+++ b/code/vsub_b.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] - b.byte[i];
+}
diff --git a/code/vsub_d.cpp b/code/vsub_d.cpp
new file mode 100644
index 00000000..ae6cdec9
--- /dev/null
+++ b/code/vsub_d.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vsub_d(v128 a, v128 b) {
+  v128 dst;
+#include "vsub_d.h"
+  return dst;
+}
+
+void test() { FUZZ2(vsub_d); }
diff --git a/code/vsub_d.h b/code/vsub_d.h
new file mode 100644
index 00000000..17f9fb29
--- /dev/null
+++ b/code/vsub_d.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] - b.dword[i];
+}
diff --git a/code/vsub_h.cpp b/code/vsub_h.cpp
new file mode 100644
index 00000000..e352a7ca
--- /dev/null
+++ b/code/vsub_h.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vsub_h(v128 a, v128 b) {
+  v128 dst;
+#include "vsub_h.h"
+  return dst;
+}
+
+void test() { FUZZ2(vsub_h); }
diff --git a/code/vsub_h.h b/code/vsub_h.h
new file mode 100644
index 00000000..f815f49f
--- /dev/null
+++ b/code/vsub_h.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] - b.half[i];
+}
diff --git a/code/vsub_q.cpp b/code/vsub_q.cpp
new file mode 100644
index 00000000..40a27c60
--- /dev/null
+++ b/code/vsub_q.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vsub_q(v128 a, v128 b) {
+  v128 dst;
+#include "vsub_q.h"
+  return dst;
+}
+
+void test() { FUZZ2(vsub_q); }
diff --git a/code/vsub_q.h b/code/vsub_q.h
new file mode 100644
index 00000000..e3851e12
--- /dev/null
+++ b/code/vsub_q.h
@@ -0,0 +1 @@
+dst.qword[0] = a.qword[0] - b.qword[0];
diff --git a/code/vsub_w.cpp b/code/vsub_w.cpp
new file mode 100644
index 00000000..d4c2a80d
--- /dev/null
+++ b/code/vsub_w.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vsub_w(v128 a, v128 b) {
+  v128 dst;
+#include "vsub_w.h"
+  return dst;
+}
+
+void test() { FUZZ2(vsub_w); }
diff --git a/code/vsub_w.h b/code/vsub_w.h
new file mode 100644
index 00000000..686b871d
--- /dev/null
+++ b/code/vsub_w.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] - b.word[i];
+}
diff --git a/docs/lsx/integer_computation.md b/docs/lsx/integer_computation.md
index 1abf3442..1ec8e724 100644
--- a/docs/lsx/integer_computation.md
+++ b/docs/lsx/integer_computation.md
@@ -145,6 +145,13 @@
 {{ vmulwev('q', 'du') }}
 {{ vmulwev('q', 'du', 'd') }}
 
+{{ vsub('b') }}
+{{ vsub('h') }}
+{{ vsub('w') }}
+{{ vsub('d') }}
+{{ vsub('q') }}
+
+
 {{ vsubwev('h', 'b') }}
 {{ vsubwev('h', 'bu') }}
 {{ vsubwev('w', 'h') }}
diff --git a/main.py b/main.py
index 1560e318..07726325 100644
--- a/main.py
+++ b/main.py
@@ -447,3 +447,11 @@ def vldrepl(name):
             desc=f"Read {width}-bit data from memory address `addr + (offset << {shift})`, replicate the data to all vector lanes and save into `dst`.",
         )
 
+    @env.macro
+    def vsub(name):
+        width = widths[name]  # `name` is the element-width suffix interpolated into the intrinsic/instr names below
+        return instruction(
+            intrinsic=f"__m128i __lsx_vsub_{name} (__m128i a, __m128i b)",
+            instr=f"vsub.{name} vr, vr, vr",
+            desc=f"Subtract {width}-bit elements in `b` from `a`, save the result in `dst`.",
+        )