From f7112a72b73c77c3e341bda426560ae547a51117 Mon Sep 17 00:00:00 2001
From: Jiajie Chen <c@jia.je>
Date: Tue, 12 Dec 2023 20:50:25 +0800
Subject: [PATCH] Add vpcnt

---
 README.md                      |  8 +-------
 code/common.h                  | 10 ++++++++++
 code/gen_impl.py               |  7 +++++++
 code/gen_tb.py                 |  1 +
 code/vpcnt_b.cpp               |  9 +++++++++
 code/vpcnt_b.h                 |  3 +++
 code/vpcnt_d.cpp               |  9 +++++++++
 code/vpcnt_d.h                 |  3 +++
 code/vpcnt_h.cpp               |  9 +++++++++
 code/vpcnt_h.h                 |  3 +++
 code/vpcnt_w.cpp               |  9 +++++++++
 code/vpcnt_w.h                 |  3 +++
 docs/lsx/bitwise_operations.md |  5 +++++
 main.py                        |  9 +++++++++
 14 files changed, 81 insertions(+), 7 deletions(-)
 create mode 100644 code/vpcnt_b.cpp
 create mode 100644 code/vpcnt_b.h
 create mode 100644 code/vpcnt_d.cpp
 create mode 100644 code/vpcnt_d.h
 create mode 100644 code/vpcnt_h.cpp
 create mode 100644 code/vpcnt_h.h
 create mode 100644 code/vpcnt_w.cpp
 create mode 100644 code/vpcnt_w.h
diff --git a/README.md b/README.md
index 604f9998..66365b40 100644
--- a/README.md
+++ b/README.md
@@ -132,12 +132,6 @@ Vector Multiplication High
 
 ### vfrstpi.b/h
 
-### vclo.b/h/w/d
-
-### vclz.b/h/w/d
-
-### vpcnt.b/h/w/d
-
 ### vneg.b/h/w/d
 
 ### vmskltz.b/h/w/d
@@ -248,7 +242,7 @@ Vector Multiplication High
 
 ### vssrlni.b.h/h.w/w.d/d.q/bu.h/hu.w/wu.d/du.q
 
-### vssrlrni.b.h/h.w/w.d/d.q/bu.h/bhu.w/wu.d/du.q
+### vssrlrni.b.h/h.w/w.d/d.q/bu.h/hu.w/wu.d/du.q
 
 ### vsrani.b.h/h.w/w.d/d.q
 
diff --git a/code/common.h b/code/common.h
index e3c5e0c9..6090f758 100644
--- a/code/common.h
+++ b/code/common.h
@@ -39,6 +39,16 @@ template <typename T> u8 clz(T num) {
   return sizeof(T) * 8;
 }
 
+// Population count: returns how many bits of `num` are set to one.
+// Each bit is tested on its own, so any integer width of T is handled.
+template <typename T> u8 popcount(T num) {
+  u8 ones = 0;
+  for (int bit = 0; bit < (int)(sizeof(T) * 8); bit++) {
+    ones += (num >> bit) & 1;
+  }
+  return ones;
+}
+
 using std::max;
 using std::min;
 
diff --git a/code/gen_impl.py b/code/gen_impl.py
index 0e8a61a8..503522c7 100644
--- a/code/gen_impl.py
+++ b/code/gen_impl.py
@@ -278,6 +278,13 @@
             file=f,
         )
         print(f"}}", file=f)
+    # vpcnt: the emitted C snippet stores popcount() of every element of `a`
+    # into the matching element of `dst`.
+    with open(f"vpcnt_{width}.h", "w") as f:
+        vpcnt_lanes = 128 // w  # loop bound written into the generated C
+        print(f"for (int i = 0;i < {vpcnt_lanes};i++) {{", file=f)
+        print(f"  dst.{m}[i] = popcount(a.{m}[i]);", file=f)
+        print("}", file=f)
     with open(f"vextrins_{width}.h", "w") as f:
         print(f"for (int i = 0;i < {128 // w};i++) {{", file=f)
         mask = 128 // w - 1
diff --git a/code/gen_tb.py b/code/gen_tb.py
index c53d57e0..83fbd8af 100644
--- a/code/gen_tb.py
+++ b/code/gen_tb.py
@@ -72,6 +72,7 @@
     "vmul": (widths_signed, "v128 a, v128 b"),
     "vmulwev": (widths_vaddw, "v128 a, v128 b"),
     "vmulwod": (widths_vaddw, "v128 a, v128 b"),
+    "vpcnt": (widths_signed, "v128 a"),
     "vsub": (widths_signed, "v128 a, v128 b"),
     "vsubwev": (widths_vsubw, "v128 a, v128 b"),
     "vsubwod": (widths_vsubw, "v128 a, v128 b"),
diff --git a/code/vpcnt_b.cpp b/code/vpcnt_b.cpp
new file mode 100644
index 00000000..701811b2
--- /dev/null
+++ b/code/vpcnt_b.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vpcnt_b(v128 a) {
+  v128 dst;
+#include "vpcnt_b.h"
+  return dst;
+}
+
+void test() { FUZZ1(vpcnt_b); }
diff --git a/code/vpcnt_b.h b/code/vpcnt_b.h
new file mode 100644
index 00000000..529750c7
--- /dev/null
+++ b/code/vpcnt_b.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 16; i++) {
+  dst.byte[i] = popcount(a.byte[i]);
+}
diff --git a/code/vpcnt_d.cpp b/code/vpcnt_d.cpp
new file mode 100644
index 00000000..9dfb88be
--- /dev/null
+++ b/code/vpcnt_d.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vpcnt_d(v128 a) {
+  v128 dst;
+#include "vpcnt_d.h"
+  return dst;
+}
+
+void test() { FUZZ1(vpcnt_d); }
diff --git a/code/vpcnt_d.h b/code/vpcnt_d.h
new file mode 100644
index 00000000..4ae85315
--- /dev/null
+++ b/code/vpcnt_d.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 2; i++) {
+  dst.dword[i] = popcount(a.dword[i]);
+}
diff --git a/code/vpcnt_h.cpp b/code/vpcnt_h.cpp
new file mode 100644
index 00000000..666f65eb
--- /dev/null
+++ b/code/vpcnt_h.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vpcnt_h(v128 a) {
+  v128 dst;
+#include "vpcnt_h.h"
+  return dst;
+}
+
+void test() { FUZZ1(vpcnt_h); }
diff --git a/code/vpcnt_h.h b/code/vpcnt_h.h
new file mode 100644
index 00000000..2e751d4d
--- /dev/null
+++ b/code/vpcnt_h.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 8; i++) {
+  dst.half[i] = popcount(a.half[i]);
+}
diff --git a/code/vpcnt_w.cpp b/code/vpcnt_w.cpp
new file mode 100644
index 00000000..fbc0719b
--- /dev/null
+++ b/code/vpcnt_w.cpp
@@ -0,0 +1,9 @@
+#include "common.h"
+
+v128 vpcnt_w(v128 a) {
+  v128 dst;
+#include "vpcnt_w.h"
+  return dst;
+}
+
+void test() { FUZZ1(vpcnt_w); }
diff --git a/code/vpcnt_w.h b/code/vpcnt_w.h
new file mode 100644
index 00000000..b33f0872
--- /dev/null
+++ b/code/vpcnt_w.h
@@ -0,0 +1,3 @@
+for (int i = 0; i < 4; i++) {
+  dst.word[i] = popcount(a.word[i]);
+}
diff --git a/docs/lsx/bitwise_operations.md b/docs/lsx/bitwise_operations.md
index aac9c42e..a40d243e 100644
--- a/docs/lsx/bitwise_operations.md
+++ b/docs/lsx/bitwise_operations.md
@@ -98,3 +98,8 @@ Compute bitwise selection: for each bit position, if the bit in `a` equals to on
 {{ vextrins('h') }}
 {{ vextrins('w') }}
 {{ vextrins('d') }}
+
+{{ vpcnt('b') }}
+{{ vpcnt('h') }}
+{{ vpcnt('w') }}
+{{ vpcnt('d') }}
\ No newline at end of file
diff --git a/main.py b/main.py
index 39db0e8a..2af30f41 100644
--- a/main.py
+++ b/main.py
@@ -524,4 +524,13 @@ def vmsub(name):
             intrinsic=f"__m128i __lsx_vmsub_{name} (__m128i a, __m128i b, __m128i c)",
             instr=f"vmsub.{name} vr, vr, vr",
             desc=f"Multiply {width}-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.",
+        )
+
+    @env.macro
+    def vpcnt(name):  # renders the vpcnt.{name} entry; `name` keys into `widths`
+        width = widths[name]
+        return instruction(
+            intrinsic=f"__m128i __lsx_vpcnt_{name} (__m128i a)",
+            instr=f"vpcnt.{name} vr, vr",
+            desc=f"Count the number of ones in {width}-bit elements in `a`.",
         )
\ No newline at end of file