From 11729587dd5114d82ae2abf31cb8de6f99e372a7 Mon Sep 17 00:00:00 2001
From: Jiajie Chen <c@jia.je>
Date: Wed, 13 Dec 2023 20:41:25 +0800
Subject: [PATCH] Move all instructions to main.py

---
 code/vfclass_d.h               |   3 +
 code/vfclass_s.h               |   3 +
 code/vfmadd_d.h                |   3 +
 code/vfmadd_s.h                |   3 +
 code/vfmsub_d.h                |   3 +
 code/vfmsub_s.h                |   3 +
 code/vfnmadd_d.h               |   3 +
 code/vfnmadd_s.h               |   3 +
 code/vfnmsub_d.h               |   3 +
 code/vfnmsub_s.h               |   3 +
 docs/lsx/bitwise_operations.md |  42 +-------
 docs/lsx/float_conversion.md   | 130 ++---------------------
 docs/lsx/float_misc.md         |  48 +--------
 docs/lsx/fma.md                | 188 ++-------------------------------
 main.py                        | 145 +++++++++++++++++++++++++
 15 files changed, 193 insertions(+), 390 deletions(-)
 create mode 100644 code/vfclass_d.h
 create mode 100644 code/vfclass_s.h
 create mode 100644 code/vfmadd_d.h
 create mode 100644 code/vfmadd_s.h
 create mode 100644 code/vfmsub_d.h
 create mode 100644 code/vfmsub_s.h
 create mode 100644 code/vfnmadd_d.h
 create mode 100644 code/vfnmadd_s.h
 create mode 100644 code/vfnmsub_d.h
 create mode 100644 code/vfnmsub_s.h

diff --git a/code/vfclass_d.h b/code/vfclass_d.h
new file mode 100644
index 00000000..3a148e0a
--- /dev/null
+++ b/code/vfclass_d.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 2;i++) {
+    dst.dword[i] = fp_classify(a.fp64[i]);
+}
\ No newline at end of file
diff --git a/code/vfclass_s.h b/code/vfclass_s.h
new file mode 100644
index 00000000..e11265f4
--- /dev/null
+++ b/code/vfclass_s.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 4;i++) {
+    dst.word[i] = fp_classify(a.fp32[i]);
+}
\ No newline at end of file
diff --git a/code/vfmadd_d.h b/code/vfmadd_d.h
new file mode 100644
index 00000000..03b8f030
--- /dev/null
+++ b/code/vfmadd_d.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 2;i++) {
+    dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
\ No newline at end of file
diff --git a/code/vfmadd_s.h b/code/vfmadd_s.h
new file mode 100644
index 00000000..528eed2f
--- /dev/null
+++ b/code/vfmadd_s.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 4;i++) {
+    dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
\ No newline at end of file
diff --git a/code/vfmsub_d.h b/code/vfmsub_d.h
new file mode 100644
index 00000000..cf80e91e
--- /dev/null
+++ b/code/vfmsub_d.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 2;i++) {
+    dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
\ No newline at end of file
diff --git a/code/vfmsub_s.h b/code/vfmsub_s.h
new file mode 100644
index 00000000..f5d2db36
--- /dev/null
+++ b/code/vfmsub_s.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 4;i++) {
+    dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
\ No newline at end of file
diff --git a/code/vfnmadd_d.h b/code/vfnmadd_d.h
new file mode 100644
index 00000000..6672c0c4
--- /dev/null
+++ b/code/vfnmadd_d.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 2;i++) {
+    dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
\ No newline at end of file
diff --git a/code/vfnmadd_s.h b/code/vfnmadd_s.h
new file mode 100644
index 00000000..553d1eef
--- /dev/null
+++ b/code/vfnmadd_s.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 4;i++) {
+    dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
\ No newline at end of file
diff --git a/code/vfnmsub_d.h b/code/vfnmsub_d.h
new file mode 100644
index 00000000..5abbb7a6
--- /dev/null
+++ b/code/vfnmsub_d.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 2;i++) {
+    dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
\ No newline at end of file
diff --git a/code/vfnmsub_s.h b/code/vfnmsub_s.h
new file mode 100644
index 00000000..32b44b70
--- /dev/null
+++ b/code/vfnmsub_s.h
@@ -0,0 +1,3 @@
+for (int i = 0;i < 4;i++) {
+    dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
\ No newline at end of file
diff --git a/docs/lsx/bitwise_operations.md b/docs/lsx/bitwise_operations.md
index a40d243e..3753a4db 100644
--- a/docs/lsx/bitwise_operations.md
+++ b/docs/lsx/bitwise_operations.md
@@ -1,46 +1,8 @@
 # Bitwise Operations
 
-## __m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
+{{ vbitsel_v() }}
 
-### Synopsis
-
-```c++
-__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
-#include <lsxintrin.h>
-Instruction: vbitsel.v vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Compute bitwise selection: for each bit position, if the bit in `c` equals to one, copy the bit from `b` to `dst`, otherwise copy from `a`.
-
-### Operation
-
-```c++
-{% include 'vbitsel_v.h' %}
-```
-
-## __m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
-
-### Synopsis
-
-```c++
-__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
-#include <lsxintrin.h>
-Instruction: vbitseli.b vr, vr, imm
-CPU Flags: LSX
-```
-
-### Description
-
-Compute bitwise selection: for each bit position, if the bit in `a` equals to one, copy the bit from `imm` to `dst`, otherwise copy from `b`.
-
-### Operation
-
-```c++
-{% include 'vbitseli_b.h' %}
-```
+{{ vbitseli_b() }}
 
 {{ vbitclr('b') }}
 {{ vbitclr('h') }}
diff --git a/docs/lsx/float_conversion.md b/docs/lsx/float_conversion.md
index a127b169..19261699 100644
--- a/docs/lsx/float_conversion.md
+++ b/docs/lsx/float_conversion.md
@@ -1,132 +1,14 @@
 # Floating Point Conversion
 
-## __m128d __lsx_vfcvth_d_s (__m128 a)
+{{ vfcvth_d_s() }}
+{{ vfcvtl_d_s() }}
 
-### Synopsis
+{{ vfcvt_s_d() }}
 
-```c++
-__m128d __lsx_vfcvth_d_s (__m128 a)
-#include <lsxintrin.h>
-Instruction: vfcvth.d.s vr, vr
-CPU Flags: LSX
-```
+{{ vfcvth_s_h() }}
+{{ vfcvtl_s_h() }}
 
-### Description
-
-Convert single precision floating point elements in higher half of `a` to double precision.
-
-### Operation
-
-```c++
-{% include('vfcvth_d_s.h') %}
-```
-
-## __m128d __lsx_vfcvtl_d_s (__m128 a)
-
-### Synopsis
-
-```c++
-__m128d __lsx_vfcvtl_d_s (__m128 a)
-#include <lsxintrin.h>
-Instruction: vfcvtl.d.s vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Convert single precision floating point elements in lower half of `a` to double precision.
-
-### Operation
-
-```c++
-{% include('vfcvtl_d_s.h') %}
-```
-
-
-## __m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)
-
-### Synopsis
-
-```c++
-__m128 __lsx_vfcvt_s_d (__m128a, __m128d b)
-#include <lsxintrin.h>
-Instruction: vfcvt.s.d vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Convert double precision floating point elements in `a` and `b` to double precision.
-
-### Operation
-
-```c++
-{% include('vfcvt_s_d.h') %}
-```
-
-## __m128 __lsx_vfcvth_s_h (__m128i a)
-
-### Synopsis
-
-```c++
-__m128 __lsx_vfcvth_s_h (__m128i a)
-#include <lsxintrin.h>
-Instruction: vfcvth.s.h vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Convert half precision floating point elements in higher half of `a` to single precision.
-
-### Operation
-
-```c++
-{% include('vfcvth_s_h.h') %}
-```
-
-## __m128 __lsx_vfcvtl_s_h (__m128i a)
-
-### Synopsis
-
-```c++
-__m128 __lsx_vfcvtl_s_h (__m128i a)
-#include <lsxintrin.h>
-Instruction: vfcvtl.s.h vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Convert half precision floating point elements in lower half of `a` to single precision.
-
-### Operation
-
-```c++
-{% include('vfcvtl_s_h.h') %}
-```
-
-
-## __m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)
-
-### Synopsis
-
-```c++
-__m128i __lsx_vfcvt_h_s (__m128, __m128 b)
-#include <lsxintrin.h>
-Instruction: vfcvt.h.s vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Convert single precision floating point elements in `a` and `b` to half precision.
-
-### Operation
-
-```c++
-{% include('vfcvt_h_s.h') %}
-```
+{{ vfcvt_h_s() }}
 
 {{ vffint_d_w('h') }}
 {{ vffint_d_w('l') }}
diff --git a/docs/lsx/float_misc.md b/docs/lsx/float_misc.md
index b6abf3cf..fa99b45a 100644
--- a/docs/lsx/float_misc.md
+++ b/docs/lsx/float_misc.md
@@ -1,51 +1,7 @@
 # Floatint Point Misc
 
-## __m128i __lsx_vfclass_d (__m128d a)
-
-### Synopsis
-
-```c++
-__m128i __lsx_vfclass_d (__m128d a)
-#include <lsxintrin.h>
-Instruction: vfclass.d vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Classifiy each double precision floating point elements in `a`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 2;i++) {
-    dst.dword[i] = fp_classify(a.fp64[i]);
-}
-```
-
-## __m128i __lsx_vfclass_s (__m128 a)
-
-### Synopsis
-
-```c++
-__m128i __lsx_vfclass_s (__m128d a)
-#include <lsxintrin.h>
-Instruction: vfclass.s vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Classifiy each single precision floating point elements in `a`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 4;i++) {
-    dst.word[i] = fp_classify(a.fp32[i]);
-}
-```
-
+{{ vfclass_d() }}
+{{ vfclass_s() }}
 
 {{ vfrint('', 's') }}
 {{ vfrint('', 'd') }}
diff --git a/docs/lsx/fma.md b/docs/lsx/fma.md
index 3fba949a..f1476d92 100644
--- a/docs/lsx/fma.md
+++ b/docs/lsx/fma.md
@@ -1,185 +1,13 @@
 # Fused Multiply-Add
 
-## __m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+{{ vfmadd_d() }}
+{{ vfmadd_s() }}
 
-### Synopsis
+{{ vfmsub_d() }}
+{{ vfmsub_s() }}
 
-```c++
-__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
-#include <lsxintrin.h>
-Instruction: vfmadd.s vr, vr, vr
-CPU Flags: LSX
-```
+{{ vfnmadd_d() }}
+{{ vfnmadd_s() }}
 
-### Description
-
-Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the result in `dst`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 4;i++) {
-    dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
-}
-```
-
-## __m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
-
-### Synopsis
-
-```c++
-__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
-#include <lsxintrin.h>
-Instruction: vfmadd.d vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the result in `dst`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 2;i++) {
-    dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
-}
-```
-
-## __m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
-
-### Synopsis
-
-```c++
-__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
-#include <lsxintrin.h>
-Instruction: vfmsub.s vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the result in `dst`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 4;i++) {
-    dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
-}
-```
-
-## __m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
-
-### Synopsis
-
-```c++
-__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
-#include <lsxintrin.h>
-Instruction: vfmsub.d vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the result in `dst`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 2;i++) {
-    dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
-}
-```
-
-## __m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
-
-### Synopsis
-
-```c++
-__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
-#include <lsxintrin.h>
-Instruction: vfnmadd.s vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the negated result in `dst`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 4;i++) {
-    dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
-}
-```
-
-## __m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
-
-### Synopsis
-
-```c++
-__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
-#include <lsxintrin.h>
-Instruction: vfnmadd.d vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the negated result in `dst`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 2;i++) {
-    dst.fp64[i] = (a.fp64[i] * b.fp64[i] + c.fp64[i]);
-}
-```
-
-## __m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
-
-### Synopsis
-
-```c++
-__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
-#include <lsxintrin.h>
-Instruction: vfnmsub.s vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 4;i++) {
-    dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
-}
-```
-
-## __m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
-
-### Synopsis
-
-```c++
-__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
-#include <lsxintrin.h>
-Instruction: vfnmsub.d vr, vr, vr
-CPU Flags: LSX
-```
-
-### Description
-
-Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.
-
-### Operation
-
-```c++
-for (int i = 0;i < 2;i++) {
-    dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
-}
-```
+{{ vfnmsub_d() }}
+{{ vfnmsub_s() }}
diff --git a/main.py b/main.py
index 29dceb0c..b336fbec 100644
--- a/main.py
+++ b/main.py
@@ -1457,4 +1457,149 @@ def vstx():
             intrinsic=f"void __lsx_vstx (__m128i data, void * addr, long int offset)",
             instr=f"vstx vr, r, r",
             desc=f"Write 128-bit data in `data` to memory address `addr + offset`.",
+        )
+
+    @env.macro
+    def vbitsel_v():
+        return instruction(
+            intrinsic=f"__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)",
+            instr=f"vbitsel.v vr, vr, vr",
+            desc=f"Compute bitwise selection: for each bit position, if the bit in `c` equals to one, copy the bit from `b` to `dst`, otherwise copy from `a`.",
+        )
+
+    @env.macro
+    def vbitseli_b():
+        return instruction(
+            intrinsic=f"__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)",
+            instr=f"vbitseli.b vr, vr, imm",
+            desc=f"Compute bitwise selection: for each bit position, if the bit in `a` equals to one, copy the bit from `imm` to `dst`, otherwise copy from `b`.",
+        )
+
+    @env.macro
+    def vfcvth_d_s():
+        return instruction(
+            intrinsic=f"__m128d __lsx_vfcvth_d_s (__m128 a)",
+            instr=f"vfcvth.d.s vr, vr",
+            desc=f"Convert single precision floating point elements in higher half of `a` to double precision.",
+        )
+
+    @env.macro
+    def vfcvtl_d_s():
+        return instruction(
+            intrinsic=f"__m128d __lsx_vfcvtl_d_s (__m128 a)",
+            instr=f"vfcvtl.d.s vr, vr",
+            desc=f"Convert single precision floating point elements in lower half of `a` to double precision.",
+        )
+
+    @env.macro
+    def vfcvt_s_d():  # invoked as {{ vfcvt_s_d() }} from docs/lsx/float_conversion.md
+        return instruction(
+            intrinsic=f"__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)",
+            instr=f"vfcvt.s.d vr, vr, vr",
+            desc=f"Convert double precision floating point elements in `a` and `b` to single precision.",
+        )
+
+    @env.macro
+    def vfcvth_s_h():
+        return instruction(
+            intrinsic=f"__m128 __lsx_vfcvth_s_h (__m128i a)",
+            instr=f"vfcvth.s.h vr, vr",
+            desc=f"Convert half precision floating point elements in higher half of `a` to single precision.",
+        )
+
+    @env.macro
+    def vfcvtl_s_h():
+        return instruction(
+            intrinsic=f"__m128 __lsx_vfcvtl_s_h (__m128i a)",
+            instr=f"vfcvtl.s.h vr, vr",
+            desc=f"Convert half precision floating point elements in lower half of `a` to single precision.",
+        )
+
+
+    @env.macro
+    def vfcvt_h_s():
+        return instruction(
+            intrinsic=f"__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)",
+            instr=f"vfcvt.h.s vr, vr, vr",
+            desc=f"Convert single precision floating point elements in `a` and `b` to half precision.",
+        )
+
+    @env.macro
+    def vfclass_d():  # the template call in docs/lsx/float_misc.md must use this exact name
+        return instruction(
+            intrinsic=f"__m128i __lsx_vfclass_d (__m128d a)",
+            instr=f"vfclass.d vr, vr",
+            desc=f"Classify each double precision floating point element in `a`.",
+        )
+
+    @env.macro
+    def vfclass_s():  # the template call in docs/lsx/float_misc.md must use this exact name
+        return instruction(
+            intrinsic=f"__m128i __lsx_vfclass_s (__m128 a)",
+            instr=f"vfclass.s vr, vr",
+            desc=f"Classify each single precision floating point element in `a`.",
+        )
+
+    @env.macro
+    def vfmadd_s():
+        return instruction(
+            intrinsic=f"__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)",
+            instr=f"vfmadd.s vr, vr, vr",
+            desc=f"Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the result in `dst`.",
+        )
+
+    @env.macro
+    def vfmadd_d():
+        return instruction(
+            intrinsic=f"__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)",
+            instr=f"vfmadd.d vr, vr, vr",
+            desc=f"Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the result in `dst`.",
+        )
+
+    @env.macro
+    def vfmsub_s():
+        return instruction(
+            intrinsic=f"__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)",
+            instr=f"vfmsub.s vr, vr, vr",
+            desc=f"Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the result in `dst`.",
+        )
+
+    @env.macro
+    def vfmsub_d():
+        return instruction(
+            intrinsic=f"__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)",
+            instr=f"vfmsub.d vr, vr, vr",
+            desc=f"Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the result in `dst`.",
+        )
+
+    @env.macro
+    def vfnmadd_s():
+        return instruction(
+            intrinsic=f"__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)",
+            instr=f"vfnmadd.s vr, vr, vr",
+            desc=f"Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the negated result in `dst`.",
+        )
+
+    @env.macro
+    def vfnmadd_d():
+        return instruction(
+            intrinsic=f"__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)",
+            instr=f"vfnmadd.d vr, vr, vr",
+            desc=f"Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the negated result in `dst`.",
+        )
+
+    @env.macro
+    def vfnmsub_s():
+        return instruction(
+            intrinsic=f"__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)",
+            instr=f"vfnmsub.s vr, vr, vr",
+            desc=f"Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.",
+        )
+
+    @env.macro
+    def vfnmsub_d():
+        return instruction(
+            intrinsic=f"__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)",
+            instr=f"vfnmsub.d vr, vr, vr",
+            desc=f"Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.",
         )
\ No newline at end of file