From a2f61faa23dab171c02a2a817a9de42158f7d3f7 Mon Sep 17 00:00:00 2001
From: Jiajie Chen <c@jia.je>
Date: Tue, 12 Dec 2023 21:37:28 +0800
Subject: [PATCH] Add vmuh

---
 README.md                       |  4 ----
 code/gen_impl.py                |  7 +++++++
 code/gen_tb.py                  |  1 +
 code/vmuh_b.cpp                 |  9 +++++++++
 code/vmuh_b.h                   |  3 +++
 code/vmuh_bu.cpp                |  9 +++++++++
 code/vmuh_bu.h                  |  3 +++
 code/vmuh_d.cpp                 |  9 +++++++++
 code/vmuh_d.h                   |  3 +++
 code/vmuh_du.cpp                |  9 +++++++++
 code/vmuh_du.h                  |  3 +++
 code/vmuh_h.cpp                 |  9 +++++++++
 code/vmuh_h.h                   |  3 +++
 code/vmuh_hu.cpp                |  9 +++++++++
 code/vmuh_hu.h                  |  3 +++
 code/vmuh_w.cpp                 |  9 +++++++++
 code/vmuh_w.h                   |  3 +++
 code/vmuh_wu.cpp                |  9 +++++++++
 code/vmuh_wu.h                  |  3 +++
 docs/lsx/integer_computation.md |  9 +++++++++
 main.py                         | 10 ++++++++++
 21 files changed, 123 insertions(+), 4 deletions(-)
 create mode 100644 code/vmuh_b.cpp
 create mode 100644 code/vmuh_b.h
 create mode 100644 code/vmuh_bu.cpp
 create mode 100644 code/vmuh_bu.h
 create mode 100644 code/vmuh_d.cpp
 create mode 100644 code/vmuh_d.h
 create mode 100644 code/vmuh_du.cpp
 create mode 100644 code/vmuh_du.h
 create mode 100644 code/vmuh_h.cpp
 create mode 100644 code/vmuh_h.h
 create mode 100644 code/vmuh_hu.cpp
 create mode 100644 code/vmuh_hu.h
 create mode 100644 code/vmuh_w.cpp
 create mode 100644 code/vmuh_w.h
 create mode 100644 code/vmuh_wu.cpp
 create mode 100644 code/vmuh_wu.h

diff --git a/README.md b/README.md
index 746e929f..9e5a69ed 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,6 @@ Arranged from QEMU implementation and [GCC Intrinsics](https://gcc.gnu.org/onlin
 
 TODO List:
 
-### vmuh.b/h/w/d/bu/hu/wu/du
-
-Vector Multiplication High
-
 ### vmod.b/h/w/d
 
 ### vmod.bu/hu/wu/du
diff --git a/code/gen_impl.py b/code/gen_impl.py
index 9dfa438c..235440c0 100644
--- a/code/gen_impl.py
+++ b/code/gen_impl.py
@@ -159,6 +159,13 @@
                 file=f,
             )
             print(f"}}", file=f)
+    with open(f"vmuh_{width}.h", "w") as f:  # vmuh: multiply, keep the high half
+        print(f"for (int i = 0;i < {128 // w};i++) {{", file=f)  # 128 // w lanes
+        print(  # widen both operands to 2*w bits, multiply, shift the upper w bits down
+            f"  dst.{m}[i] = ((({sign}{w * 2})({sign}{w})a.{m}[i] * ({sign}{w * 2})({sign}{w})b.{m}[i])) >> {w};",
+            file=f,
+        )
+        print(f"}}", file=f)
 
 for width in ["b", "bu", "h", "hu", "w", "wu", "d", "du"]:
     double_width = double_widths[width]
diff --git a/code/gen_tb.py b/code/gen_tb.py
index ea89cc05..7f76d09f 100644
--- a/code/gen_tb.py
+++ b/code/gen_tb.py
@@ -69,6 +69,7 @@
     "vmin": (widths_all, "v128 a, v128 b"),
     "vmini": (widths_all, "v128 a, int imm", [0, 3, 15]),
     "vmsub": (widths_signed, "v128 a, v128 b, v128 c"),
+    "vmuh": (widths_all, "v128 a, v128 b"),
     "vmul": (widths_signed, "v128 a, v128 b"),
     "vmulwev": (widths_vaddw, "v128 a, v128 b"),
     "vmulwod": (widths_vaddw, "v128 a, v128 b"),
diff --git a/code/vmuh_b.cpp b/code/vmuh_b.cpp
new file mode 100644
index 00000000..982d9b1c
--- /dev/null
+++ b/code/vmuh_b.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vmuh_b(v128 a, v128 b) {
+  v128 dst;
+#include "vmuh_b.h"
+  return dst;
+}
+
+void test() { FUZZ2(vmuh_b); }
diff --git a/code/vmuh_b.h b/code/vmuh_b.h
new file mode 100644
index 00000000..12822357
--- /dev/null
+++ b/code/vmuh_b.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
diff --git a/code/vmuh_bu.cpp b/code/vmuh_bu.cpp
new file mode 100644
index 00000000..08da45b7
--- /dev/null
+++ b/code/vmuh_bu.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vmuh_bu(v128 a, v128 b) {
+  v128 dst;
+#include "vmuh_bu.h"
+  return dst;
+}
+
+void test() { FUZZ2(vmuh_bu); }
diff --git a/code/vmuh_bu.h b/code/vmuh_bu.h
new file mode 100644
index 00000000..323b1453
--- /dev/null
+++ b/code/vmuh_bu.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
diff --git a/code/vmuh_d.cpp b/code/vmuh_d.cpp
new file mode 100644
index 00000000..82d3aa2d
--- /dev/null
+++ b/code/vmuh_d.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vmuh_d(v128 a, v128 b) {
+  v128 dst;
+#include "vmuh_d.h"
+  return dst;
+}
+
+void test() { FUZZ2(vmuh_d); }
diff --git a/code/vmuh_d.h b/code/vmuh_d.h
new file mode 100644
index 00000000..20a405ac
--- /dev/null
+++ b/code/vmuh_d.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
diff --git a/code/vmuh_du.cpp b/code/vmuh_du.cpp
new file mode 100644
index 00000000..64cd143a
--- /dev/null
+++ b/code/vmuh_du.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vmuh_du(v128 a, v128 b) {
+  v128 dst;
+#include "vmuh_du.h"
+  return dst;
+}
+
+void test() { FUZZ2(vmuh_du); }
diff --git a/code/vmuh_du.h b/code/vmuh_du.h
new file mode 100644
index 00000000..0553bc55
--- /dev/null
+++ b/code/vmuh_du.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
diff --git a/code/vmuh_h.cpp b/code/vmuh_h.cpp
new file mode 100644
index 00000000..e4b09a51
--- /dev/null
+++ b/code/vmuh_h.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vmuh_h(v128 a, v128 b) {
+  v128 dst;
+#include "vmuh_h.h"
+  return dst;
+}
+
+void test() { FUZZ2(vmuh_h); }
diff --git a/code/vmuh_h.h b/code/vmuh_h.h
new file mode 100644
index 00000000..7138dfc7
--- /dev/null
+++ b/code/vmuh_h.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 8; i++) {
+  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
diff --git a/code/vmuh_hu.cpp b/code/vmuh_hu.cpp
new file mode 100644
index 00000000..b49c397b
--- /dev/null
+++ b/code/vmuh_hu.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vmuh_hu(v128 a, v128 b) {
+  v128 dst;
+#include "vmuh_hu.h"
+  return dst;
+}
+
+void test() { FUZZ2(vmuh_hu); }
diff --git a/code/vmuh_hu.h b/code/vmuh_hu.h
new file mode 100644
index 00000000..2df2e6a8
--- /dev/null
+++ b/code/vmuh_hu.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 8; i++) {
+  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
diff --git a/code/vmuh_w.cpp b/code/vmuh_w.cpp
new file mode 100644
index 00000000..7c63eb6a
--- /dev/null
+++ b/code/vmuh_w.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vmuh_w(v128 a, v128 b) {
+  v128 dst;
+#include "vmuh_w.h"
+  return dst;
+}
+
+void test() { FUZZ2(vmuh_w); }
diff --git a/code/vmuh_w.h b/code/vmuh_w.h
new file mode 100644
index 00000000..903f13cf
--- /dev/null
+++ b/code/vmuh_w.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 4; i++) {
+  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
diff --git a/code/vmuh_wu.cpp b/code/vmuh_wu.cpp
new file mode 100644
index 00000000..556498ff
--- /dev/null
+++ b/code/vmuh_wu.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vmuh_wu(v128 a, v128 b) {
+  v128 dst;
+#include "vmuh_wu.h"
+  return dst;
+}
+
+void test() { FUZZ2(vmuh_wu); }
diff --git a/code/vmuh_wu.h b/code/vmuh_wu.h
new file mode 100644
index 00000000..99019aa8
--- /dev/null
+++ b/code/vmuh_wu.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 4; i++) {
+  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
diff --git a/docs/lsx/integer_computation.md b/docs/lsx/integer_computation.md
index a498d70a..63a4cf7e 100644
--- a/docs/lsx/integer_computation.md
+++ b/docs/lsx/integer_computation.md
@@ -168,6 +168,15 @@
 {{ vmsub('w') }}
 {{ vmsub('d') }}
 
+{{ vmuh('b') }}
+{{ vmuh('bu') }}
+{{ vmuh('h') }}
+{{ vmuh('hu') }}
+{{ vmuh('w') }}
+{{ vmuh('wu') }}
+{{ vmuh('d') }}
+{{ vmuh('du') }}
+
 {{ vmul('b') }}
 {{ vmul('h') }}
 {{ vmul('w') }}
diff --git a/main.py b/main.py
index 06da595e..f636506a 100644
--- a/main.py
+++ b/main.py
@@ -517,6 +517,16 @@ def vmul(name):
             desc=f"Multiply {width}-bit elements in `a` and `b`, save the result in `dst`.",
         )
 
+    @env.macro
+    def vmuh(name):  # docs macro for vmuh.<name>; the docs call it with b/bu/h/hu/w/wu/d/du
+        width = widths[name]
+        signedness = signednesses[name]  # interpolated into the description text below
+        return instruction(
+            intrinsic=f"__m128i __lsx_vmuh_{name} (__m128i a, __m128i b)",
+            instr=f"vmuh.{name} vr, vr, vr",
+            desc=f"Multiply {signedness} {width}-bit elements in `a` and `b`, save the high {width}-bit result in `dst`.",
+        )
+
     @env.macro
     def vmsub(name):
         width = widths[name]