From 0401227d0ef8176244a9a8dbf609c133247b3058 Mon Sep 17 00:00:00 2001
From: Jiajie Chen <c@jia.je>
Date: Wed, 13 Dec 2023 15:50:29 +0800
Subject: [PATCH] Fix fp32 element count in float vector ops (4 lanes, not 2)

---
 code/gen_impl.py |  6 ++++++
 code/vfadd_d.cpp | 19 +++++++++++++++++++
 code/vfadd_s.cpp | 19 +++++++++++++++++++
 code/vfadd_s.h   |  2 +-
 code/vfdiv_s.h   |  2 +-
 code/vfmax_s.h   |  2 +-
 code/vfmaxa_s.h  |  2 +-
 code/vfmin_s.h   |  2 +-
 code/vfmina_s.h  |  2 +-
 code/vfmul_s.h   |  2 +-
 code/vfsub_d.cpp | 19 +++++++++++++++++++
 code/vfsub_s.cpp | 19 +++++++++++++++++++
 code/vfsub_s.h   |  2 +-
 13 files changed, 90 insertions(+), 8 deletions(-)
 create mode 100644 code/vfadd_d.cpp
 create mode 100644 code/vfadd_s.cpp
 create mode 100644 code/vfsub_d.cpp
 create mode 100644 code/vfsub_s.cpp

diff --git a/code/gen_impl.py b/code/gen_impl.py
index 9d05528f..7f66212f 100644
--- a/code/gen_impl.py
+++ b/code/gen_impl.py
@@ -53,6 +53,11 @@
     "d": "fp64",
 }
 
+widths_fp = {
+    "s": 32,
+    "d": 64,
+}
+
 for width in ["b", "bu", "h", "hu", "w", "wu", "d", "du"]:
     w = widths[width]
     m = members[width]
@@ -763,6 +768,7 @@
 
 for width in ["s", "d"]:
     m = members_fp[width]
+    w = widths_fp[width]
     for name, op in [("div", "/"), ("mul", "*"), ("sub", "-"), ("add", "+")]:
         with open(f"vf{name}_{width}.h", "w") as f:
             print(f"for (int i = 0;i < {128 // w};i++) {{", file=f)
diff --git a/code/vfadd_d.cpp b/code/vfadd_d.cpp
new file mode 100644
index 00000000..63fd65a6
--- /dev/null
+++ b/code/vfadd_d.cpp
@@ -0,0 +1,19 @@
+#include "common.h"
+
+v128 vfadd_d(v128 a, v128 b) {
+  v128 dst;
+#include "vfadd_d.h"
+  return dst;
+}
+
+void test() {
+  {
+    __m128d a = {1.0, 2.0};
+    __m128d b = {5.0, 6.0};
+    PRINT(a);
+    PRINT(b);
+    PRINT(__lsx_vfadd_d(a, b));
+    PRINT(vfadd_d(a, b));
+    assert(vfadd_d(a, b) == __lsx_vfadd_d(a, b));
+  }
+}
diff --git a/code/vfadd_s.cpp b/code/vfadd_s.cpp
new file mode 100644
index 00000000..cb328089
--- /dev/null
+++ b/code/vfadd_s.cpp
@@ -0,0 +1,19 @@
+#include "common.h"
+
+v128 vfadd_s(v128 a, v128 b) {
+  v128 dst;
+#include "vfadd_s.h"
+  return dst;
+}
+
+void test() {
+  {
+    __m128 a = {1.0, 2.0, 3.0, 4.0};
+    __m128 b = {5.0, 6.0, 7.0, 8.0};
+    PRINT(a);
+    PRINT(b);
+    PRINT(__lsx_vfadd_s(a, b));
+    PRINT(vfadd_s(a, b));
+    assert(vfadd_s(a, b) == __lsx_vfadd_s(a, b));
+  }
+}
diff --git a/code/vfadd_s.h b/code/vfadd_s.h
index 3f67c7f7..401f4385 100644
--- a/code/vfadd_s.h
+++ b/code/vfadd_s.h
@@ -1,3 +1,3 @@
-for (int i = 0; i < 2; i++) {
+for (int i = 0; i < 4; i++) {
   dst.fp32[i] = a.fp32[i] + b.fp32[i];
 }
diff --git a/code/vfdiv_s.h b/code/vfdiv_s.h
index a13b10ee..34f2464a 100644
--- a/code/vfdiv_s.h
+++ b/code/vfdiv_s.h
@@ -1,3 +1,3 @@
-for (int i = 0; i < 2; i++) {
+for (int i = 0; i < 4; i++) {
   dst.fp32[i] = a.fp32[i] / b.fp32[i];
 }
diff --git a/code/vfmax_s.h b/code/vfmax_s.h
index 85a2b177..f30ee8f5 100644
--- a/code/vfmax_s.h
+++ b/code/vfmax_s.h
@@ -1,3 +1,3 @@
-for (int i = 0; i < 2; i++) {
+for (int i = 0; i < 4; i++) {
   dst.fp32[i] = +(a.fp32[i], b.fp32[i]);
 }
diff --git a/code/vfmaxa_s.h b/code/vfmaxa_s.h
index 90c964e9..2c703f1a 100644
--- a/code/vfmaxa_s.h
+++ b/code/vfmaxa_s.h
@@ -1,3 +1,3 @@
-for (int i = 0; i < 2; i++) {
+for (int i = 0; i < 4; i++) {
   dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
 }
diff --git a/code/vfmin_s.h b/code/vfmin_s.h
index 85a2b177..f30ee8f5 100644
--- a/code/vfmin_s.h
+++ b/code/vfmin_s.h
@@ -1,3 +1,3 @@
-for (int i = 0; i < 2; i++) {
+for (int i = 0; i < 4; i++) {
   dst.fp32[i] = +(a.fp32[i], b.fp32[i]);
 }
diff --git a/code/vfmina_s.h b/code/vfmina_s.h
index da77ed57..834c5a0a 100644
--- a/code/vfmina_s.h
+++ b/code/vfmina_s.h
@@ -1,3 +1,3 @@
-for (int i = 0; i < 2; i++) {
+for (int i = 0; i < 4; i++) {
   dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
 }
diff --git a/code/vfmul_s.h b/code/vfmul_s.h
index 93fc790b..13c64799 100644
--- a/code/vfmul_s.h
+++ b/code/vfmul_s.h
@@ -1,3 +1,3 @@
-for (int i = 0; i < 2; i++) {
+for (int i = 0; i < 4; i++) {
   dst.fp32[i] = a.fp32[i] * b.fp32[i];
 }
diff --git a/code/vfsub_d.cpp b/code/vfsub_d.cpp
new file mode 100644
index 00000000..e1be91c0
--- /dev/null
+++ b/code/vfsub_d.cpp
@@ -0,0 +1,19 @@
+#include "common.h"
+
+v128 vfsub_d(v128 a, v128 b) {
+  v128 dst;
+#include "vfsub_d.h"
+  return dst;
+}
+
+void test() {
+  {
+    __m128d a = {1.0, 2.0};
+    __m128d b = {5.0, 6.0};
+    PRINT(a);
+    PRINT(b);
+    PRINT(__lsx_vfsub_d(a, b));
+    PRINT(vfsub_d(a, b));
+    assert(vfsub_d(a, b) == __lsx_vfsub_d(a, b));
+  }
+}
diff --git a/code/vfsub_s.cpp b/code/vfsub_s.cpp
new file mode 100644
index 00000000..3042e3f3
--- /dev/null
+++ b/code/vfsub_s.cpp
@@ -0,0 +1,19 @@
+#include "common.h"
+
+v128 vfsub_s(v128 a, v128 b) {
+  v128 dst;
+#include "vfsub_s.h"
+  return dst;
+}
+
+void test() {
+  {
+    __m128 a = {1.0, 2.0, 3.0, 4.0};
+    __m128 b = {5.0, 6.0, 7.0, 8.0};
+    PRINT(a);
+    PRINT(b);
+    PRINT(__lsx_vfsub_s(a, b));
+    PRINT(vfsub_s(a, b));
+    assert(vfsub_s(a, b) == __lsx_vfsub_s(a, b));
+  }
+}
diff --git a/code/vfsub_s.h b/code/vfsub_s.h
index 4937b20b..efe840f9 100644
--- a/code/vfsub_s.h
+++ b/code/vfsub_s.h
@@ -1,3 +1,3 @@
-for (int i = 0; i < 2; i++) {
+for (int i = 0; i < 4; i++) {
   dst.fp32[i] = a.fp32[i] - b.fp32[i];
 }