404
+ +Page not found
+ + +diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..f6eae60a --- /dev/null +++ b/404.html @@ -0,0 +1,184 @@ + + +
+ + + + +Page not found
+ + +This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et al. The documentation is compiled from the following sources:
+The guide provides pseudo code for the SIMD intrinsics. The code assumes that the elements of the LSX/LASX vector registers can be accessed via members of a union
:
union lsx_register {
+ uint8_t byte[16];
+ uint16_t half[8];
+ uint32_t word[4];
+ uint64_t dword[2];
+ uint128_t qword[1];
+ float fp32[4];
+ double fp64[2];
+};
+
+union lasx_register {
+ uint8_t byte[32];
+ uint16_t half[16];
+ uint32_t word[8];
+ uint64_t dword[4];
+ uint128_t qword[2];
+ float fp32[8];
+ double fp64[4];
+};
+
+
+ __m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvbitsel.v xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise selection: for each bit position, if the bit in c
equals one, copy the bit from b
to dst
, otherwise copy from a
.
__m256i __lasx_xvbitsel_v(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0xffff0000aaaabbbb, 0x1111222233334444, 0x00000000ffffffff, 0xffffffff00000000})
+= 0xabab3344ffeeefab 0x98ba9beccfedfb00 0xabcdef1243214321 0x56785678ddeeddee
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseli.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise selection: for each bit position, if the bit in a
equals one, copy the bit from imm
to dst
, otherwise copy from b
.
__m256i __lasx_xvbitseli_b( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0xba8b9aabba8b9a23 0x1216123012031221 0x1230123653115311 0x5652565212121212
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.b xr, xr, xr
+CPU Flags: LASX
+
+Clear the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitclr_b(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700 0xabcdeb0212341234 0xaabaaaba9dee9dee
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.h xr, xr, xr
+CPU Flags: LASX
+
+Clear the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitclr_h(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xf7fff7fff7fff7ff 0x99aabbccddecff00 0xabcdef0212341234 0xaabbaabbdceedcee
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.w xr, xr, xr
+CPU Flags: LASX
+
+Clear the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitclr_w(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xfffff7fffffff7ff 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbdceeddee
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.d xr, xr, xr
+CPU Flags: LASX
+
+Clear the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitclr_d(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xfffff7ffffffffff 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaabbddeeddee
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.b xr, xr, imm
+CPU Flags: LASX
+
+Clear the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitclri_b( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00 0xa9cded1010341034 0xa8b9a8b9ddecddec
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.h xr, xr, imm
+CPU Flags: LASX
+
+Clear the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitclri_h( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffdfffdfffdfffd 0x99a8bbccddecff00 0xabcdef1012341234 0xaab9aab9ddecddec
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.w xr, xr, imm
+CPU Flags: LASX
+
+Clear the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitclri_w( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffffffdfffffffd 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaab9ddeeddec
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.d xr, xr, imm
+CPU Flags: LASX
+
+Clear the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitclri_d( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffffffffffffffd 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbddeeddec
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitset_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.b xr, xr, xr
+CPU Flags: LASX
+
+Set the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitset_b(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0808080808080808 0x9dbabfdcddeeff02 0xafddef121a361a36 0xeabbeabbddefddef
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitset_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.h xr, xr, xr
+CPU Flags: LASX
+
+Set the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitset_h(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0800080008000800 0x99babbdcddeeff02 0xabddef1212361236 0xabbbabbbddeeddee
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitset_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.w xr, xr, xr
+CPU Flags: LASX
+
+Set the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitset_w(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0000080000000800 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbddeeddee
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitset_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.d xr, xr, xr
+CPU Flags: LASX
+
+Set the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitset_d(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0000080000000000 0x99aabbceddeeff00 0xabcdef1212341234 0xabbbaabbddeeddee
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.b xr, xr, imm
+CPU Flags: LASX
+
+Set the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitseti_b( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0202020202020202 0x9baabbcedfeeff02 0xabcfef1212361236 0xaabbaabbdfeedfee
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.h xr, xr, imm
+CPU Flags: LASX
+
+Set the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitseti_h( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0002000200020002 0x99aabbceddeeff02 0xabcfef1212361236 0xaabbaabbddeeddee
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.w xr, xr, imm
+CPU Flags: LASX
+
+Set the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitseti_w( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0000000200000002 0x99aabbceddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.d xr, xr, imm
+CPU Flags: LASX
+
+Set the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitseti_d( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0000000000000002 0x99aabbccddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.b xr, xr, xr
+CPU Flags: LASX
+
+Toggle the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitrev_b(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0707070707070707 0x9dbabfdcd5ecf702 0xafddeb021a361a36 0xeabaeaba9def9def
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.h xr, xr, xr
+CPU Flags: LASX
+
+Toggle the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitrev_h(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x070f070f070f070f 0x99babbdcddecff02 0xabddef0212361236 0xabbbabbbdceedcee
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.w xr, xr, xr
+CPU Flags: LASX
+
+Toggle the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitrev_w(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0f0f070f0f0f070f 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbdceeddee
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.d xr, xr, xr
+CPU Flags: LASX
+
+Toggle the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitrev_d(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00 0xabcdef1012341234 0xabbbaabbddeeddee
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.b xr, xr, imm
+CPU Flags: LASX
+
+Toggle the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitrevi_b( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02 0xa9cfed1010361036 0xa8b9a8b9dfecdfec
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.h xr, xr, imm
+CPU Flags: LASX
+
+Toggle the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitrevi_h( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02 0xabcfef1012361236 0xaab9aab9ddecddec
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.w xr, xr, imm
+CPU Flags: LASX
+
+Toggle the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitrevi_w( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02 0xabcdef1012341236 0xaabbaab9ddeeddec
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.d xr, xr, imm
+CPU Flags: LASX
+
+Toggle the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
__m256i __lasx_xvbitrevi_d( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02 0xabcdef1212341236 0xaabbaabbddeeddec
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvclo_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.b xr, xr
+CPU Flags: LASX
+
+Count leading ones of 8-bit elements in a
.
__m256i __lasx_xvclo_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000001 0x0101010202030800 0x0102030000000000 0x0101010102030203
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = clo(a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvclo_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.h xr, xr
+CPU Flags: LASX
+
+Count leading ones of 16-bit elements in a
.
__m256i __lasx_xvclo_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0001000100020008 0x0001000300000000 0x0001000100020002
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = clo(a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvclo_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.w xr, xr
+CPU Flags: LASX
+
+Count leading ones of 32-bit elements in a
.
__m256i __lasx_xvclo_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0000000100000002 0x0000000100000000 0x0000000100000002
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = clo(a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvclo_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.d xr, xr
+CPU Flags: LASX
+
+Count leading ones of 64-bit elements in a
.
__m256i __lasx_xvclo_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0000000000000001 0x0000000000000001 0x0000000000000001
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = clo(a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvclz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.b xr, xr
+CPU Flags: LASX
+
+Count leading zeros of 8-bit elements in a
.
__m256i __lasx_xvclz_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0302020101010100 0x0000000000000008 0x0000000303020302 0x0000000000000000
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = clz(a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvclz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.h xr, xr
+CPU Flags: LASX
+
+Count leading zeros of 16-bit elements in a
.
__m256i __lasx_xvclz_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0003000200010001 0x0000000000000000 0x0000000000030003 0x0000000000000000
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = clz(a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvclz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.w xr, xr
+CPU Flags: LASX
+
+Count leading zeros of 32-bit elements in a
.
__m256i __lasx_xvclz_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000300000001 0x0000000000000000 0x0000000000000003 0x0000000000000000
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = clz(a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvclz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.d xr, xr
+CPU Flags: LASX
+
+Count leading zeros of 64-bit elements in a
.
__m256i __lasx_xvclz_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000003 0x0000000000000000 0x0000000000000000 0x0000000000000000
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = clz(a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvpcnt_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.b xr, xr
+CPU Flags: LASX
+
+Count the number of ones (population, popcount) in 8-bit elements in a
.
__m256i __lasx_xvpcnt_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0202040204040602 0x0404060406060800 0x0505070202030203 0x0406040606060606
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = popcount(a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvpcnt_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.h xr, xr
+CPU Flags: LASX
+
+Count the number of ones (population, popcount) in 16-bit elements in a
.
__m256i __lasx_xvpcnt_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0004000600080008 0x0008000a000c0008 0x000a000900050005 0x000a000a000c000c
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = popcount(a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvpcnt_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.w xr, xr
+CPU Flags: LASX
+
+Count the number of ones (population, popcount) in 32-bit elements in a
.
__m256i __lasx_xvpcnt_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000a00000010 0x0000001200000014 0x000000130000000a 0x0000001400000018
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = popcount(a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvpcnt_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.d xr, xr
+CPU Flags: LASX
+
+Count the number of ones (population, popcount) in 64-bit elements in a
.
__m256i __lasx_xvpcnt_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x000000000000001a 0x0000000000000026 0x000000000000001d 0x000000000000002c
+
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = popcount(a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
int __lasx_xbz_v (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvseteqz.v fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if the whole vector a
equals zero.
dst = a.qword[0] == 0 && a.qword[1] == 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbnz_v (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetnez.v fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if the whole vector a
is non-zero.
dst = a.qword[0] != 0 || a.qword[1] != 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.b fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if any 8-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 32; i++) {
+ if (a.byte[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.h fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if any 16-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 16; i++) {
+ if (a.half[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.w fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if any 32-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 8; i++) {
+ if (a.word[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.d fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if any 64-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 4; i++) {
+ if (a.dword[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbnz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.b fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if all 8-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 32; i++) {
+ if (a.byte[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbnz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.h fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if all 16-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 16; i++) {
+ if (a.half[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbnz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.w fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if all 32-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 8; i++) {
+ if (a.word[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lasx_xbnz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.d fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if all 64-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 4; i++) {
+ if (a.dword[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.caf.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.caf.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.ceq.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.ceq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cle.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cle.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.clt.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.clt.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cne.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cne.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cor.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cor.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cueq.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cueq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cule.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cule.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cult.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cult.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cun.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cun.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cune.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cune.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.saf.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.saf.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.seq.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.seq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sle.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sle.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.slt.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.slt.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sne.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sne.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sor.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sor.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sueq.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sueq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sule.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sule.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sult.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sult.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sun.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sun.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sune.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sune.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256 __lasx_xvfadd_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfadd.s xr, xr, xr
+CPU Flags: LASX
+
+Add single precision floating point elements in a
to elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +5 | +2 | +
__m256d __lasx_xvfadd_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfadd.d xr, xr, xr
+CPU Flags: LASX
+
+Add double precision floating point elements in a
to elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +5 | +2 | +
__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfdiv.s xr, xr, xr
+CPU Flags: LASX
+
+Divide single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18(1/5.5) | +
3C5000 | +11, 19.5 | +0.1(1/10.5) | +
__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfdiv.d xr, xr, xr
+CPU Flags: LASX
+
+Divide double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 21.5 | +0.25(1/4) | +
3C5000 | +8, 17 | +0.08(1/12.5) | +
__m256 __lasx_xvfmax_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmax.s xr, xr, xr
+CPU Flags: LASX
+
+Compute maximum of single precision floating point elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256d __lasx_xvfmax_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmax.d xr, xr, xr
+CPU Flags: LASX
+
+Compute maximum of double precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmaxa.s xr, xr, xr
+CPU Flags: LASX
+
+Compute maximum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmaxa.d xr, xr, xr
+CPU Flags: LASX
+
+Compute maximum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256 __lasx_xvfmin_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmin.s xr, xr, xr
+CPU Flags: LASX
+
+Compute minimum of single precision floating point elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256d __lasx_xvfmin_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmin.d xr, xr, xr
+CPU Flags: LASX
+
+Compute minimum of double precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256 __lasx_xvfmina_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmina.s xr, xr, xr
+CPU Flags: LASX
+
+Compute minimum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256d __lasx_xvfmina_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmina.d xr, xr, xr
+CPU Flags: LASX
+
+Compute minimum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256 __lasx_xvfmul_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmul.s xr, xr, xr
+CPU Flags: LASX
+
+Multiply single precision floating point elements in a
and elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256d __lasx_xvfmul_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmul.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply double precision floating point elements in a
and elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256 __lasx_xvfsub_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfsub.s xr, xr, xr
+CPU Flags: LASX
+
+Subtract single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +5 | +2 | +
__m256d __lasx_xvfsub_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfsub.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +5 | +2 | +
__m256 __lasx_xvflogb_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvflogb.s xr, xr
+CPU Flags: LASX
+
+Compute the base-2 logarithm of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = log2(a.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256d __lasx_xvflogb_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvflogb.d xr, xr
+CPU Flags: LASX
+
+Compute the base-2 logarithm of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = log2(a.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256 __lasx_xvfsqrt_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfsqrt.s xr, xr
+CPU Flags: LASX
+
+Compute square root of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +15 | +0.08(1/12) | +
3C5000 | +15 | +0.07(1/13.5) | +
__m256d __lasx_xvfsqrt_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfsqrt.d xr, xr
+CPU Flags: LASX
+
+Compute square root of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +36 | +0.06(1/17.5) | +
3C5000 | +36 | +0.05(1/18.5) | +
__m256 __lasx_xvfrsqrt_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrt.s xr, xr
+CPU Flags: LASX
+
+Compute reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +25 | +0.05(1/19) | +
3C5000 | +25 | +0.03(1/32) | +
__m256d __lasx_xvfrsqrt_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrt.d xr, xr
+CPU Flags: LASX
+
+Compute reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +15 | +0.04(1/26.5) | +
3C5000 | +15 | +0.04(1/27.5) | +
__m256 __lasx_xvfrecip_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrecip.s xr, xr
+CPU Flags: LASX
+
+Compute reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = 1 / a.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +27 | +0.18(1/5.5) | +
3C5000 | +27 | +0.14(1/7) | +
__m256d __lasx_xvfrecip_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrecip.d xr, xr
+CPU Flags: LASX
+
+Compute reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = 1 / a.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +23 | +0.25(1/4) | +
3C5000 | +23 | +0.08(1/12) | +
__m256 __lasx_xvfrsqrte_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrte.s xr, xr
+CPU Flags: LASX
+
+Compute estimated reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+__m256d __lasx_xvfrsqrte_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrte.d xr, xr
+CPU Flags: LASX
+
+Compute estimated reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+__m256 __lasx_xvfrecipe_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrecipe.s xr, xr
+CPU Flags: LASX
+
+Compute estimated reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+__m256d __lasx_xvfrecipe_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrecipe.d xr, xr
+CPU Flags: LASX
+
+Compute estimated reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+
+ __m256d __lasx_xvfcvth_d_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfcvth.d.s xr, xr
+CPU Flags: LASX
+
+Convert single precision floating point elements in higher half of a
to double precision.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp32[4 + i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m256d __lasx_xvfcvtl_d_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfcvtl.d.s xr, xr
+CPU Flags: LASX
+
+Convert single precision floating point elements in lower half of a
to double precision.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcvt.s.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double precision floating point elements in a
and b
to single precision.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ dst.fp32[i] = b.fp64[i];
+ } else {
+ dst.fp32[i] = a.fp64[i - 4];
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m256 __lasx_xvfcvth_s_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvfcvth.s.h xr, xr
+CPU Flags: LASX
+
+Convert half precision floating point elements in higher half of a
to single precision.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp16[8 + i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m256 __lasx_xvfcvtl_s_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvfcvtl.s.h xr, xr
+CPU Flags: LASX
+
+Convert half precision floating point elements in lower half of a
to single precision.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp16[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcvt.h.s xr, xr, xr
+CPU Flags: LASX
+
+Convert single precision floating point elements in a
and b
to half precision.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ dst.fp16[i] = b.fp32[i];
+ } else {
+ dst.fp16[i] = a.fp32[i - 8];
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m256d __lasx_xvffinth_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffinth.d.w xr, xr
+CPU Flags: LASX
+
+Convert 32-bit integer elements in higher part of a
to double precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i + 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256d __lasx_xvffintl_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffintl.d.w xr, xr
+CPU Flags: LASX
+
+Convert 32-bit integer elements in lower part of a
to double precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256d __lasx_xvffint_d_l (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.d.l xr, xr
+CPU Flags: LASX
+
+Convert signed 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256d __lasx_xvffint_d_lu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.d.lu xr, xr
+CPU Flags: LASX
+
+Convert unsigned 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256 __lasx_xvffint_s_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.s.w xr, xr
+CPU Flags: LASX
+
+Convert signed 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256 __lasx_xvffint_s_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.s.wu xr, xr
+CPU Flags: LASX
+
+Convert unsigned 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvffint.s.l xr, xr, xr
+CPU Flags: LASX
+
+Convert 64-bit integer elements in a
and b
to single-precision floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] =
+ (i < 4) ? (f32)(s64)a.dword[i]
+ : (f32)(s64)b.dword[i - 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintl.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftinth_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftinth.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrml_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrml.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrmh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrmh.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrpl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrpl.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrph_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrph.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrzl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrzl.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrzh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrzh.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrnel_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrnel.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrneh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrneh.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftint_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftint.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftint_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftint.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrm_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrm.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrm_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrm.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrp_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrp.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrp_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrp.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrz_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrz_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrne_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrne.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrne_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrne.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftint_lu_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftint.lu.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftint_wu_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftint.wu.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrz_lu_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.lu.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftintrz_wu_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.wu.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, rounding towards zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftint.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i < 4)
+ ? (s32)a.fp64[i]
+ : (s32)b.fp64[i - 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrm.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i < 4)
+ ? (s32)a.fp64[i]
+ : (s32)b.fp64[i - 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrp.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i < 4)
+ ? (s32)a.fp64[i]
+ : (s32)b.fp64[i - 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrz.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i < 4)
+ ? (s32)a.fp64[i]
+ : (s32)b.fp64[i - 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrne.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i < 4)
+ ? (s32)a.fp64[i]
+ : (s32)b.fp64[i - 4]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m256i __lasx_xvfclass_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfclass.d xr, xr
+CPU Flags: LASX
+
+Classify each double precision floating point element in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfclass_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfclass.s xr, xr
+CPU Flags: LASX
+
+Classify each single precision floating point element in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256 __lasx_xvfrint_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrint.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, using current rounding mode specified in fcsr
, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256d __lasx_xvfrint_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrint.d xr, xr
+CPU Flags: LASX
+
+Round double-precision floating point elements in a
to integers, using current rounding mode specified in fcsr
, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256 __lasx_xvfrintrp_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrp.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256d __lasx_xvfrintrp_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrp.d xr, xr
+CPU Flags: LASX
+
+Round double-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256 __lasx_xvfrintrm_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrm.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256d __lasx_xvfrintrm_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrm.d xr, xr
+CPU Flags: LASX
+
+Round double-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256 __lasx_xvfrintrz_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrz.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256d __lasx_xvfrintrz_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrz.d xr, xr
+CPU Flags: LASX
+
+Round double-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256 __lasx_xvfrintrne_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrne.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256d __lasx_xvfrintrne_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrne.d xr, xr
+CPU Flags: LASX
+
+Round double-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfnmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfnmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfnmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfnmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m256i __lasx_xvseq_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.b xr, xr, xr
+CPU Flags: LASX
+
+Compare the 8-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvseq_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.h xr, xr, xr
+CPU Flags: LASX
+
+Compare the 16-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvseq_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.w xr, xr, xr
+CPU Flags: LASX
+
+Compare the 32-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvseq_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare the 64-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.b xr, xr, imm
+CPU Flags: LASX
+
+Compare the 8-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.h xr, xr, imm
+CPU Flags: LASX
+
+Compare the 16-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.w xr, xr, imm
+CPU Flags: LASX
+
+Compare the 32-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.d xr, xr, imm
+CPU Flags: LASX
+
+Compare the 64-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslt_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.b xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslt_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.bu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslt_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.h xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslt_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.hu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslt_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.w xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslt_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.wu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslt_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.d xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvslt_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.du xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.b xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.bu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.h xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.hu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.w xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.wu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.d xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.du xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsle_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.b xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsle_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.bu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsle_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.h xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsle_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.hu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsle_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.w xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsle_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.wu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsle_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.d xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsle_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.du xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.b xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.bu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.h xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.hu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.w xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.wu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.d xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.du xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvadd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.b xr, xr, xr
+CPU Flags: LASX
+
+Add 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvadd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.h xr, xr, xr
+CPU Flags: LASX
+
+Add 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] + b.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvadd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.w xr, xr, xr
+CPU Flags: LASX
+
+Add 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] + b.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvadd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.d xr, xr, xr
+CPU Flags: LASX
+
+Add 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvadd_q (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.q xr, xr, xr
+CPU Flags: LASX
+
+Add 128-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = a.qword[i] + b.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvabsd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.b xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvabsd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.h xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvabsd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.w xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvabsd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.d xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvabsd_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.du xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvadda_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.b xr, xr, xr
+CPU Flags: LASX
+
+Add absolute of 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvadda_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.h xr, xr, xr
+CPU Flags: LASX
+
+Add absolute of 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvadda_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.w xr, xr, xr
+CPU Flags: LASX
+
+Add absolute of 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvadda_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.d xr, xr, xr
+CPU Flags: LASX
+
+Add absolute of 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.bu xr, xr, imm
+CPU Flags: LASX
+
+Add 8-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] + imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.hu xr, xr, imm
+CPU Flags: LASX
+
+Add 16-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] + imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.wu xr, xr, imm
+CPU Flags: LASX
+
+Add 32-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] + imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.du xr, xr, imm
+CPU Flags: LASX
+
+Add 64-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] + imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvavg_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.b xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavg_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavg_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.h xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavg_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavg_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.w xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavg_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavg_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.d xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvavg_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.du xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvavgr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.b xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavgr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.h xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavgr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.w xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvavgr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.d xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvavgr_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.du xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvdiv_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.b xr, xr, xr
+CPU Flags: LASX
+
+Divide signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 32 | +0.06(1/15.5) | +
3C5000 | +32, 36 | +0.05(1/20.5) | +
__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.bu xr, xr, xr
+CPU Flags: LASX
+
+Divide unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 33 | +0.06(1/16.5) | +
3C5000 | +29, 36 | +0.05(1/20.5) | +
__m256i __lasx_xvdiv_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.h xr, xr, xr
+CPU Flags: LASX
+
+Divide signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17 | +0.12(1/8.5) | +
3C5000 | +21.5, 22 | +0.08(1/13) | +
__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.hu xr, xr, xr
+CPU Flags: LASX
+
+Divide unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 22 | +0.11(1/9) | +
3C5000 | +17, 21.5 | +0.07(1/15) | +
__m256i __lasx_xvdiv_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.w xr, xr, xr
+CPU Flags: LASX
+
+Divide signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18(1/5.5) | +
3C5000 | +11, 17.5 | +0.09(1/11.5) | +
__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.wu xr, xr, xr
+CPU Flags: LASX
+
+Divide unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18(1/5.5) | +
3C5000 | +11, 17.5 | +0.07(1/15) | +
__m256i __lasx_xvdiv_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.d xr, xr, xr
+CPU Flags: LASX
+
+Divide signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25(1/4) | +
3C5000 | +8, 18.5 | +0.11(1/9) | +
__m256i __lasx_xvdiv_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.du xr, xr, xr
+CPU Flags: LASX
+
+Divide unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25(1/4) | +
3C5000 | +8, 18.5 | +0.11(1/9) | +
__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.h.b xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 8-bit elements in a
to even-positioned signed 8-bit elements in b
to get a 16-bit result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.hu.bu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 8-bit elements in a
to even-positioned unsigned 8-bit elements in b
to get a 16-bit result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.w.h xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 16-bit elements in a
to even-positioned signed 16-bit elements in b
to get a 32-bit result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.wu.hu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 16-bit elements in a
to even-positioned unsigned 16-bit elements in b
to get a 32-bit result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.d.w xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 32-bit elements in a
to even-positioned signed 32-bit elements in b
to get a 64-bit result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.du.wu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 32-bit elements in a
to even-positioned unsigned 32-bit elements in b
to get a 64-bit result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.q.d xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 64-bit elements in a
to even-positioned signed 64-bit elements in b
to get a 128-bit result.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.qu.du xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 64-bit elements in a
to even-positioned unsigned 64-bit elements in b
to get a 128-bit result.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.h.b xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 8-bit elements in b
from odd-positioned signed 8-bit elements in a
to get a 16-bit result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.hu.bu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 8-bit elements in b
from odd-positioned unsigned 8-bit elements in a
to get a 16-bit result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.w.h xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 16-bit elements in b
from odd-positioned signed 16-bit elements in a
to get a 32-bit result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.wu.hu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 16-bit elements in b
from odd-positioned unsigned 16-bit elements in a
to get a 32-bit result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.d.w xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 32-bit elements in b
from odd-positioned signed 32-bit elements in a
to get a 64-bit result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.du.wu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 32-bit elements in b
from odd-positioned unsigned 32-bit elements in a
to get a 64-bit result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.q.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 64-bit elements in b
from odd-positioned signed 64-bit elements in a
to get a 128-bit result.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.qu.du xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 64-bit elements in b
from odd-positioned unsigned 64-bit elements in a
to get a 128-bit result.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply 8-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply 16-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply 32-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply 64-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] =
+ (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] =
+ (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] =
+ (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+ (u32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+ (u64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+ (u128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m256i __lasx_xvmax_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.b xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmax_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmax_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.h xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmax_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmax_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.w xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmax_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmax_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.d xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmax_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.du xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.b xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.bu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.h xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.hu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.w xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.wu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.d xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.du xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmin_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.b xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmin_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmin_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.h xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmin_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmin_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.w xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmin_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmin_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.d xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmin_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.du xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.b xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.bu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.h xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.hu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.w xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.wu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.d xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.du xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.b xr, xr, xr
+CPU Flags: LASX
+
+Compute the remainder of signed 8-bit elements in a
divided by elements in b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 41 | +0.06(1/15.5) | +
3C5000 | +29, 33 | +0.05(1/21.5) | +
__m256i __lasx_xvmod_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute the remainder of unsigned 8-bit elements in a
divided by elements in b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 37 | +0.06(1/17.5) | +
3C5000 | +29, 37 | +0.05(1/22) | +
__m256i __lasx_xvmod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.h xr, xr, xr
+CPU Flags: LASX
+
+Compute the remainder of signed 16-bit elements in a
divided by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 21 | +0.12(1/8.5) | +
3C5000 | +17, 21 | +0.07(1/13.5) | +
__m256i __lasx_xvmod_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute the remainder of unsigned 16-bit elements in a
divided by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 25 | +0.11(1/9.5) | +
3C5000 | +17, 23 | +0.06(1/16) | +
__m256i __lasx_xvmod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.w xr, xr, xr
+CPU Flags: LASX
+
+Compute the remainder of signed 32-bit elements in a
divided by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11, 13 | +0.18(1/5.5) | +
3C5000 | +11, 15 | +0.07(1/13.5) | +
__m256i __lasx_xvmod_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute the remainder of unsigned 32-bit elements in a
divided by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11, 13 | +0.18(1/5.5) | +
3C5000 | +11, 15 | +0.06(1/16) | +
__m256i __lasx_xvmod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.d xr, xr, xr
+CPU Flags: LASX
+
+Compute the remainder of signed 64-bit elements in a
divided by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 10 | +0.25(1/4) | +
3C5000 | +8, 10 | +0.11(1/9.5) | +
__m256i __lasx_xvmod_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.du xr, xr, xr
+CPU Flags: LASX
+
+Compute the remainder of unsigned 64-bit elements in a
divided by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 10 | +0.25(1/4) | +
3C5000 | +8, 10 | +0.11(1/9.5) | +
__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply 8-bit elements in b
and c
+, negate the product and add elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply 16-bit elements in b
and c
+, negate the product and add elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply 32-bit elements in b
and c
+, negate the product and add elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply 64-bit elements in b
and c
+, negate the product and add elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmuh_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply signed 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply unsigned 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmuh_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply signed 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply unsigned 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmuh_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply signed 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply unsigned 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmuh_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply signed 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmuh_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply unsigned 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmul_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmul_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] * b.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmul_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] * b.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmul_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m256i __lasx_xvneg_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.b xr, xr
+CPU Flags: LASX
+
+Negate 8-bit elements in a
and save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = -a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvneg_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.h xr, xr
+CPU Flags: LASX
+
+Negate 16-bit elements in a
and save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = -a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvneg_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.w xr, xr
+CPU Flags: LASX
+
+Negate 32-bit elements in a
and save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = -a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvneg_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.d xr, xr
+CPU Flags: LASX
+
+Negate 64-bit elements in a
and save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = -a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsadd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.b xr, xr, xr
+CPU Flags: LASX
+
+Saturating add the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.bu xr, xr, xr
+CPU Flags: LASX
+
+Saturating add the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsadd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.h xr, xr, xr
+CPU Flags: LASX
+
+Saturating add the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.hu xr, xr, xr
+CPU Flags: LASX
+
+Saturating add the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsadd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.w xr, xr, xr
+CPU Flags: LASX
+
+Saturating add the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.wu xr, xr, xr
+CPU Flags: LASX
+
+Saturating add the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsadd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.d xr, xr, xr
+CPU Flags: LASX
+
+Saturating add the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsadd_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.du xr, xr, xr
+CPU Flags: LASX
+
+Saturating add the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvssub_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.b xr, xr, xr
+CPU Flags: LASX
+
+Saturating subtract the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvssub_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.bu xr, xr, xr
+CPU Flags: LASX
+
+Saturating subtract the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvssub_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.h xr, xr, xr
+CPU Flags: LASX
+
+Saturating subtract the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvssub_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.hu xr, xr, xr
+CPU Flags: LASX
+
+Saturating subtract the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvssub_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.w xr, xr, xr
+CPU Flags: LASX
+
+Saturating subtract the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvssub_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.wu xr, xr, xr
+CPU Flags: LASX
+
+Saturating subtract the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvssub_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.d xr, xr, xr
+CPU Flags: LASX
+
+Saturating subtract the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvssub_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.du xr, xr, xr
+CPU Flags: LASX
+
+Saturating subtract the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsub_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.b xr, xr, xr
+CPU Flags: LASX
+
+Subtract 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsub_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.h xr, xr, xr
+CPU Flags: LASX
+
+Subtract 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] - b.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsub_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.w xr, xr, xr
+CPU Flags: LASX
+
+Subtract 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] - b.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsub_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsub_q (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.q xr, xr, xr
+CPU Flags: LASX
+
+Subtract 128-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = a.qword[i] - b.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.bu xr, xr, imm
+CPU Flags: LASX
+
+Subtract 8-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] - imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.hu xr, xr, imm
+CPU Flags: LASX
+
+Subtract 16-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] - imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.wu xr, xr, imm
+CPU Flags: LASX
+
+Subtract 32-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] - imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.du xr, xr, imm
+CPU Flags: LASX
+
+Subtract 64-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] - imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvand_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvand.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise AND between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvandi.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise AND between elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] & imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvandn_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvandn.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise ANDN between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvnor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvnor.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise NOR between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvnori.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise NOR between elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvor.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise OR between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvori.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise OR between elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] | imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvorn_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvorn.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise ORN between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvxor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvxor.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise XOR between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvxori.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise XOR between elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] ^ imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvld xr, r, imm
+CPU Flags: LASX
+
+Read whole vector from memory address addr + offset
, save the data into dst
. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.
dst = memory_load(256, addr + offset);
+
+__m256i __lasx_xvldx (void * addr, long int offset)
+#include <lasxintrin.h>
+Instruction: xvldx xr, r, r
+CPU Flags: LASX
+
+Read whole vector from memory address addr + offset
, save the data into dst
. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.
dst = memory_load(256, addr + offset);
+
+__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.b xr, r, imm
+CPU Flags: LASX
+
+Read 8-bit data from memory address addr + (offset << 0)
, replicate the data to all vector lanes and save into dst
.
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = data;
+}
+
+__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.h xr, r, imm
+CPU Flags: LASX
+
+Read 16-bit data from memory address addr + (offset << 1)
, replicate the data to all vector lanes and save into dst
.
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = data;
+}
+
+__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.w xr, r, imm
+CPU Flags: LASX
+
+Read 32-bit data from memory address addr + (offset << 2)
, replicate the data to all vector lanes and save into dst
.
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = data;
+}
+
+__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.d xr, r, imm
+CPU Flags: LASX
+
+Read 64-bit data from memory address addr + (offset << 3)
, replicate the data to all vector lanes and save into dst
.
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = data;
+}
+
+void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvst xr, r, imm
+CPU Flags: LASX
+
+Write whole vector data in data
to memory address addr + offset
.
memory_store(256, data, addr + offset);
+
+void __lasx_xvstx (__m256i data, void * addr, long int offset)
+#include <lasxintrin.h>
+Instruction: xvstx xr, r, r
+CPU Flags: LASX
+
+Write whole-vector data in data
to memory address addr + offset
.
memory_store(256, data, addr + offset);
+
+void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.b xr, r, imm, imm
+CPU Flags: LASX
+
+Store the 8-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(8, data.byte[lane], addr + offset);
+
+void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.h xr, r, imm, imm
+CPU Flags: LASX
+
+Store the 16-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(16, data.half[lane], addr + offset);
+
+void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.w xr, r, imm, imm
+CPU Flags: LASX
+
+Store the 32-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(32, data.word[lane], addr + offset);
+
+void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.d xr, r, imm, imm
+CPU Flags: LASX
+
+Store the 64-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(64, data.dword[lane], addr + offset);
+
+
+ __m256i __lasx_xvexth_h_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.h.b xr, xr
+CPU Flags: LASX
+
+Extend signed 8-bit elements in the higher half of a
to 16-bit.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[16 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvexth_hu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.hu.bu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 8-bit elements in the higher half of a
to 16-bit.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[16 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvexth_w_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.w.h xr, xr
+CPU Flags: LASX
+
+Extend signed 16-bit elements in the higher half of a
to 32-bit.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[8 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvexth_wu_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.wu.hu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 16-bit elements in the higher half of a
to 32-bit.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[8 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvexth_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.d.w xr, xr
+CPU Flags: LASX
+
+Extend signed 32-bit elements in the higher half of a
to 64-bit.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+for (; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[4 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvexth_du_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.du.wu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 32-bit elements in the higher half of a
to 64-bit.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+for (; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[4 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvexth_q_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.q.d xr, xr
+CPU Flags: LASX
+
+Extend signed 64-bit elements in the higher half of a
to 128-bit.
int i;
+for (i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+for (; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvexth_qu_du (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.qu.du xr, xr
+CPU Flags: LASX
+
+Extend unsigned 64-bit elements in the higher half of a
to 128-bit.
int i;
+for (i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+for (; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvextl_q_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvextl.q.d xr, xr
+CPU Flags: LASX
+
+Extend signed 64-bit elements in the lower half of a
to 128-bit.
int i;
+for (i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[i];
+}
+for (; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvextl_qu_du (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvextl.qu.du xr, xr
+CPU Flags: LASX
+
+Extend unsigned 64-bit elements in the lower half of a
to 128-bit.
int i;
+for (i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[i];
+}
+for (; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.b xr, xr, imm
+CPU Flags: LASX
+
+Extract one 8-bit element in b
and insert it to a
according to imm
.
int i;
+for (i = 0; i < 16; i++) {
+ dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+for (; i < 32; i++) {
+ dst.byte[i] =
+ (i - 16 == ((imm >> 4) & 15)) ? b.byte[(imm & 15) + 16] : a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.h xr, xr, imm
+CPU Flags: LASX
+
+Extract one 16-bit element in b
and insert it to a
according to imm
.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (i - 8 == ((imm >> 4) & 7)) ? b.half[(imm & 7) + 8] : a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.w xr, xr, imm
+CPU Flags: LASX
+
+Extract one 32-bit element in b
and insert it to a
according to imm
.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (i - 4 == ((imm >> 4) & 3)) ? b.word[(imm & 3) + 4] : a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.d xr, xr, imm
+CPU Flags: LASX
+
+Extract one 64-bit element in b
and insert it to a
according to imm
.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+for (; i < 4; i++) {
+ dst.dword[i] =
+ (i - 2 == ((imm >> 4) & 1)) ? b.dword[(imm & 1) + 2] : a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_vext2xv_h_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.h.b xr, xr
+CPU Flags: LASX
+
+Extend signed 8-bit lane of a
to signed 16-bit elements.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_hu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.hu.bu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 8-bit lane of a
to unsigned 16-bit elements.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_w_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.w.b xr, xr
+CPU Flags: LASX
+
+Extend signed 8-bit lane of a
to signed 32-bit elements.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s8)a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_wu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.wu.bu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 8-bit lane of a
to unsigned 32-bit elements.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u8)a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_w_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.w.h xr, xr
+CPU Flags: LASX
+
+Extend signed 16-bit lane of a
to signed 32-bit elements.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_wu_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.wu.hu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 16-bit lane of a
to unsigned 32-bit elements.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_d_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.b xr, xr
+CPU Flags: LASX
+
+Extend signed 8-bit lane of a
to signed 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s8)a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_du_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.bu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 8-bit lane of a
to unsigned 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u8)a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_d_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.h xr, xr
+CPU Flags: LASX
+
+Extend signed 16-bit lane of a
to signed 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_du_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.hu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 16-bit lane of a
to unsigned 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.w xr, xr
+CPU Flags: LASX
+
+Extend signed 32-bit lane of a
to signed 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_vext2xv_du_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.wu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 32-bit lane of a
to unsigned 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvilvh_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.b xr, xr, xr
+CPU Flags: LASX
+
+Interleave 8-bit elements in higher half of a
and b
.
int i;
+for (i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+for (; i < 32; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 16] : b.byte[i / 2 + 16];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvilvh_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.h xr, xr, xr
+CPU Flags: LASX
+
+Interleave 16-bit elements in higher half of a
and b
.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 8] : b.half[i / 2 + 8];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvilvh_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.w xr, xr, xr
+CPU Flags: LASX
+
+Interleave 32-bit elements in higher half of a
and b
.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 4] : b.word[i / 2 + 4];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvilvh_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.d xr, xr, xr
+CPU Flags: LASX
+
+Interleave 64-bit elements in higher half of a
and b
.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+for (; i < 4; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 2] : b.dword[i / 2 + 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvilvl_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.b xr, xr, xr
+CPU Flags: LASX
+
+Interleave 8-bit elements in lower half of a
and b
.
int i;
+for (i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+for (; i < 32; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvilvl_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.h xr, xr, xr
+CPU Flags: LASX
+
+Interleave 16-bit elements in lower half of a
and b
.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvilvl_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.w xr, xr, xr
+CPU Flags: LASX
+
+Interleave 32-bit elements in lower half of a
and b
.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvilvl_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.d xr, xr, xr
+CPU Flags: LASX
+
+Interleave 64-bit elements in lower half of a
and b
.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+for (; i < 4; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvinsgr2vr.w xr, r, imm
+CPU Flags: LASX
+
+Insert 32-bit element into lane indexed imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvinsgr2vr.d xr, r, imm
+CPU Flags: LASX
+
+Insert 64-bit element into lane indexed imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvinsve0.w xr, xr, imm
+CPU Flags: LASX
+
+Insert the first 32-bit lane of b
into lane indexed imm
of a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i == imm) ? b.word[0] : a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvinsve0.d xr, xr, imm
+CPU Flags: LASX
+
+Insert the first 64-bit lane of b
into lane indexed imm
of a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i == imm) ? b.dword[0] : a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvfrstp.b xr, xr, xr
+CPU Flags: LASX
+
+Find the first negative 8-bit element in b
, set the index of the element to the lane of a
specified by c
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[c.byte[0] % 16] = i;
+for (i = 16; i < 32; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[(c.byte[16] % 16) + 16] = i - 16;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvfrstp.h xr, xr, xr
+CPU Flags: LASX
+
+Find the first negative 16-bit element in b
, set the index of the element to the lane of a
specified by c
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[c.half[0] % 8] = i;
+for (i = 8; i < 16; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[(c.half[8] % 8) + 8] = i - 8;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvfrstpi.b xr, xr, imm
+CPU Flags: LASX
+
+Find the first negative 8-bit element in b
, set the index of the element to the lane of a
specified by imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[imm % 16] = i;
+for (i = 16; i < 32; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[(imm % 16) + 16] = i - 16;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvfrstpi.h xr, xr, imm
+CPU Flags: LASX
+
+Find the first negative 16-bit element in b
, set the index of the element to the lane of a
specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[imm % 8] = i;
+for (i = 8; i < 16; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[(imm % 8) + 8] = i - 8;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvmskgez_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskgez.b xr, xr
+CPU Flags: LASX
+
+For each 8-bit element in a
, if the element is greater than or equal to zero, set one bit in dst
, otherwise clear it.
__m256i __lasx_xvmskgez_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x00000000000001fe 0x0000000000000000 0x000000000000ff0f 0x0000000000000000
+__m256i __lasx_xvmskgez_b(__m256i{0x0000191100000000, 0x00a1000011b11c11, 0x1181000008010101, 0x0000000000000000})
+= 0x000000000000bbff 0x0000000000000000 0x000000000000ffbf 0x0000000000000000
+
+u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmskltz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.b xr, xr
+CPU Flags: LASX
+
+For each 8-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
__m256i __lasx_xvmskltz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000fe01 0x0000000000000000 0x00000000000000f0 0x0000000000000000
+__m256i __lasx_xvmskltz_b(__m256i{0x0000118100000000, 0x0081000081111118, 0x1181000001010801, 0x0000000000000000})
+= 0x0000000000004810 0x0000000000000000 0x0000000000000040 0x0000000000000000
+
+u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[3] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmskltz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.h xr, xr
+CPU Flags: LASX
+
+For each 16-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
__m256i __lasx_xvmskltz_h(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x00000000000000f0 0x0000000000000000 0x000000000000000c 0x0000000000000000
+__m256i __lasx_xvmskltz_h(__m256i{0x0000818100000000, 0x0018000018181881, 0x1181000008080808, 0x0000000000000000})
+= 0x0000000000000004 0x0000000000000000 0x0000000000000000 0x0000000000000000
+
+u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[2] |= c << 4;
+dst.dword[3] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmskltz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.w xr, xr
+CPU Flags: LASX
+
+For each 32-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
__m256i __lasx_xvmskltz_w(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000000c 0x0000000000000000 0x0000000000000002 0x0000000000000000
+__m256i __lasx_xvmskltz_w(__m256i{0x0000811100000000, 0x0018000081111111, 0x8111000001010108, 0x0000000000000000})
+= 0x0000000000000004 0x0000000000000000 0x0000000000000002 0x0000000000000000
+
+u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 31;
+c >>= 62;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 31;
+c >>= 62;
+dst.dword[2] |= c << 2;
+dst.dword[3] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmskltz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.d xr, xr
+CPU Flags: LASX
+
+For each 64-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
__m256i __lasx_xvmskltz_d(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x0000000000000002 0x0000000000000000 0x0000000000000001 0x0000000000000000
+__m256i __lasx_xvmskltz_d(__m256i{0x0000111800000000, 0x0081000081111111, 0x8111000008010101, 0x0000000000000000})
+= 0x0000000000000000 0x0000000000000000 0x0000000000000001 0x0000000000000000
+
+u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c >>= 63;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c >>= 63;
+dst.dword[2] |= c << 1;
+dst.dword[3] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvmsknz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmsknz.b xr, xr
+CPU Flags: LASX
+
+For each 8-bit element in a
, if the element is non-zero, set one bit in dst
, otherwise clear it.
__m256i __lasx_xvmsknz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000feff 0x0000000000000000 0x000000000000ffff 0x0000000000000000
+__m256i __lasx_xvmsknz_b(__m256i{0x0000111100000000, 0x0011000011111111, 0x1111000001010101, 0x0000000000000000})
+= 0x0000000000004f30 0x0000000000000000 0x00000000000000cf 0x0000000000000000
+
+u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = ~(((a.dword[2] & m) + m) | a.dword[2] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = ~(((a.dword[3] & m) + m) | a.dword[3] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpackev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.b xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack even-positioned 8-bit elements in a
and b
and store dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpackev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.h xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack even-positioned 16-bit elements in a
and b
and store dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpackev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.w xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack even-positioned 32-bit elements in a
and b
and store dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpackev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.d xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack even-positioned 64-bit elements in a
and b
and store dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpackod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.b xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack odd-positioned 8-bit elements in a
and b
and store dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpackod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.h xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack odd-positioned 16-bit elements in a
and b
and store dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpackod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.w xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack odd-positioned 32-bit elements in a
and b
and store dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpackod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.d xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack odd-positioned 64-bit elements in a
and b
and store dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpickev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.b xr, xr, xr
+CPU Flags: LASX
+
+Pick even-positioned 8-bit elements in b
first, then pick even-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2] : a.byte[(i - 16) * 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpickev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.h xr, xr, xr
+CPU Flags: LASX
+
+Pick even-positioned 16-bit elements in b
first, then pick even-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (i < 12) ? b.half[(i - 4) * 2] : a.half[(i - 8) * 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpickev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.w xr, xr, xr
+CPU Flags: LASX
+
+Pick even-positioned 32-bit elements in b
first, then pick even-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (i < 6) ? b.word[(i - 2) * 2] : a.word[(i - 4) * 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpickev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.d xr, xr, xr
+CPU Flags: LASX
+
+Pick even-positioned 64-bit elements in b
first, then pick even-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2] : a.dword[(i - 2) * 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+
+Copy one 32-bit lane from a
specified by imm
to the first lane of dst
, and set the other lanes to zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+
+Copy one 64-bit lane from a
specified by imm
to the first lane of dst
, and set the other lanes to zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+
+Copy one 32-bit lane from a
specified by imm
to the first lane of dst
, and set the other lanes to zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+
+Copy one 64-bit lane from a
specified by imm
to the first lane of dst
, and set the other lanes to zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.w r, xr, imm
+CPU Flags: LASX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s32)a.word[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.wu r, xr, imm
+CPU Flags: LASX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u32)a.word[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.d r, xr, imm
+CPU Flags: LASX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s64)a.dword[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.du r, xr, imm
+CPU Flags: LASX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u64)a.dword[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m256i __lasx_xvpickod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.b xr, xr, xr
+CPU Flags: LASX
+
+Pick odd-positioned 8-bit elements in b
first, then pick odd-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2 + 1] : a.byte[(i - 16) * 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpickod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.h xr, xr, xr
+CPU Flags: LASX
+
+Pick odd-positioned 16-bit elements in b
first, then pick odd-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (i < 12) ? b.half[(i - 4) * 2 + 1] : a.half[(i - 8) * 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpickod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.w xr, xr, xr
+CPU Flags: LASX
+
+Pick odd-positioned 32-bit elements in b
first, then pick odd-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (i < 6) ? b.word[(i - 2) * 2 + 1] : a.word[(i - 4) * 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpickod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.d xr, xr, xr
+CPU Flags: LASX
+
+Pick odd-positioned 64-bit elements in b
first, then pick odd-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2 + 1] : a.dword[(i - 2) * 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvrepli_b (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = imm;
+}
+
+Tested on real machine.
+__m256i __lasx_xvrepli_h (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = imm;
+}
+
+Tested on real machine.
+__m256i __lasx_xvrepli_w (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = imm;
+}
+
+Tested on real machine.
+__m256i __lasx_xvrepli_d (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = imm;
+}
+
+Tested on real machine.
+__m256i __lasx_xvreplgr2vr_b (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.b xr, r
+CPU Flags: LASX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = val;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +1 | +
3C5000 | +N/A | +1 | +
__m256i __lasx_xvreplgr2vr_h (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.h xr, r
+CPU Flags: LASX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = val;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +1 | +
3C5000 | +N/A | +1 | +
__m256i __lasx_xvreplgr2vr_w (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.w xr, r
+CPU Flags: LASX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = val;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +1 | +
3C5000 | +N/A | +1 | +
__m256i __lasx_xvreplgr2vr_d (long int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.d xr, r
+CPU Flags: LASX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = val;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +1 | +
3C5000 | +N/A | +1 | +
__m256i __lasx_xvreplve_b (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.b xr, xr, r
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx % 16];
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = a.byte[(idx % 16) + 16];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m256i __lasx_xvreplve_h (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.h xr, xr, r
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx % 8];
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = a.half[(idx % 8) + 8];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m256i __lasx_xvreplve_w (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.w xr, xr, r
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx % 4];
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = a.word[(idx % 4) + 4];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m256i __lasx_xvreplve_d (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.d xr, xr, r
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx % 2];
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = a.dword[(idx % 2) + 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m256i __lasx_xvreplve0_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.b xr, xr
+CPU Flags: LASX
+
+Repeat the first 8-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[0];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvreplve0_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.h xr, xr
+CPU Flags: LASX
+
+Repeat the first 16-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[0];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvreplve0_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.w xr, xr
+CPU Flags: LASX
+
+Repeat the first 32-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[0];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvreplve0_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.d xr, xr
+CPU Flags: LASX
+
+Repeat the first 64-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[0];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvreplve0_q (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.q xr, xr
+CPU Flags: LASX
+
+Repeat the first 128-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = a.qword[0];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.b xr, xr, imm
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx];
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = a.byte[idx + 16];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.h xr, xr, imm
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx];
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = a.half[idx + 8];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.w xr, xr, imm
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx];
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = a.word[idx + 4];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.d xr, xr, imm
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx];
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = a.dword[idx + 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.b xr, xr, imm
+CPU Flags: LASX
+
+Clamp signed 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.bu xr, xr, imm
+CPU Flags: LASX
+
+Clamp unsigned 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.h xr, xr, imm
+CPU Flags: LASX
+
+Clamp signed 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.hu xr, xr, imm
+CPU Flags: LASX
+
+Clamp unsigned 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.w xr, xr, imm
+CPU Flags: LASX
+
+Clamp signed 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.wu xr, xr, imm
+CPU Flags: LASX
+
+Clamp unsigned 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.d xr, xr, imm
+CPU Flags: LASX
+
+Clamp signed 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.du xr, xr, imm
+CPU Flags: LASX
+
+Clamp unsigned 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.b xr, xr, xr
+CPU Flags: LASX
+
+If the 8-bit element in a
equals zero, set the result to zero. If the signed 8-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result. If a
and b
are the same vectors, it is equivalent to computing absolute value.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] =
+ (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.h xr, xr, xr
+CPU Flags: LASX
+
+If the 16-bit element in a
equals zero, set the result to zero. If the signed 16-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result. If a
and b
are the same vectors, it is equivalent to computing absolute value.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.w xr, xr, xr
+CPU Flags: LASX
+
+If the 32-bit element in a
equals zero, set the result to zero. If the signed 32-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result. If a
and b
are the same vectors, it is equivalent to computing absolute value.
for (int i = 0; i < 8; i++) {
+ dst.word[i] =
+ (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.d xr, xr, xr
+CPU Flags: LASX
+
+If the 64-bit element in a
equals zero, set the result to zero. If the signed 64-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result. If a
and b
are the same vectors, it is equivalent to computing absolute value.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] =
+ (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvldi (imm_n1024_1023 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Initialize dst
using predefined patterns:
imm[12:10]=0b000
: broadcast imm[7:0]
as 8-bit elements to all lanesimm[12:10]=0b001
: broadcast sign-extended imm[9:0]
as 16-bit elements to all lanesimm[12:10]=0b010
: broadcast sign-extended imm[9:0]
as 32-bit elements to all lanesimm[12:10]=0b011
: broadcast sign-extended imm[9:0]
as 64-bit elements to all lanesimm[12:8]=0b10000
: broadcast imm[7:0]
as 32-bit elements to all lanesimm[12:8]=0b10001
: broadcast imm[7:0] << 8
as 32-bit elements to all lanesimm[12:8]=0b10010
: broadcast imm[7:0] << 16
as 32-bit elements to all lanesimm[12:8]=0b10011
: broadcast imm[7:0] << 24
as 32-bit elements to all lanesimm[12:8]=0b10100
: broadcast imm[7:0]
as 16-bit elements to all lanesimm[12:8]=0b10101
: broadcast imm[7:0] << 8
as 16-bit elements to all lanesimm[12:8]=0b10110
: broadcast (imm[7:0] << 8) | 0xFF
as 32-bit elements to all lanesimm[12:8]=0b10111
: broadcast (imm[7:0] << 16) | 0xFFFF
as 32-bit elements to all lanesimm[12:8]=0b11000
: broadcast imm[7:0]
as 8-bit elements to all lanesimm[12:8]=0b11001
: repeat each bit of imm[7:0]
eight times, and broadcast the result as 64-bit elements to all lanesimm[12:8]=0b11010
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 32-bit elements to all lanesimm[12:8]=0b11011
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 64-bit elements to all lanesimm[12:8]=0b11100
: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)
as 64-bit elements to all lanesu64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+ broadcast_value = simm9_0;
+ broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+ broadcast_value = simm9_0;
+ broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+ broadcast_value = simm9_0;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+ broadcast_value = imm7_0 << 16;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+ broadcast_value = imm7_0 << 24;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+ broadcast_value = imm7_0;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+ broadcast_value = (imm7_0 << 8) | 0xFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+ broadcast_value = (imm7_0 << 16) | 0xFFFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+ broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+ imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+ imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+ imm7 * 0xFF00000000000000;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+ broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+ (imm5_0 << 48);
+ broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+ for (int i = 0; i < 32; i++) {
+ dst.byte[i] = broadcast_value;
+ }
+} else if (broadcast_width == 16) {
+ for (int i = 0; i < 16; i++) {
+ dst.half[i] = broadcast_value;
+ }
+} else if (broadcast_width == 32) {
+ for (int i = 0; i < 8; i++) {
+ dst.word[i] = broadcast_value;
+ }
+} else if (broadcast_width == 64) {
+ for (int i = 0; i < 4; i++) {
+ dst.dword[i] = broadcast_value;
+ }
+}
+
+
+ __m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.w xr, xr, imm
+CPU Flags: LASX
+
+Permute words from a
and b
with indices recorded in imm
and store into dst
.
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+dst.word[4] = b.word[4 + (imm & 0x3)];
+dst.word[5] = b.word[4 + ((imm >> 2) & 0x3)];
+dst.word[6] = a.word[4 + ((imm >> 4) & 0x3)];
+dst.word[7] = a.word[4 + ((imm >> 6) & 0x3)];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.d xr, xr, imm
+CPU Flags: LASX
+
+Permute double words from a
with indices recorded in imm
and store into dst
.
dst.dword[0] = a.dword[imm & 0x3];
+dst.dword[1] = a.dword[(imm >> 2) & 0x3];
+dst.dword[2] = a.dword[(imm >> 4) & 0x3];
+dst.dword[3] = a.dword[(imm >> 6) & 0x3];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.q xr, xr, imm
+CPU Flags: LASX
+
+Permute quad words from a
and b
with indices recorded in imm
and store into dst
.
if ((imm & 0x4) && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.qword[0] = 0;
+} else {
+ dst.qword[0] = (imm & 2) ? a.qword[imm & 0x1] : b.qword[imm & 0x1];
+}
+if ((imm & 0x80) && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.qword[1] = 0;
+} else {
+ dst.qword[1] =
+ (imm & 0x20) ? a.qword[(imm >> 4) & 0x1] : b.qword[(imm >> 4) & 0x1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvperm_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvperm.w xr, xr, xr
+CPU Flags: LASX
+
+Permute words from a
with indices recorded in b
and store into dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[b.word[i] % 0x8];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbsll.v xr, xr, imm
+CPU Flags: LASX
+
+Compute whole vector a
shifted left by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+dst.qword[1] = (u128)a.qword[1] << shift;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbsrl.v xr, xr, imm
+CPU Flags: LASX
+
+Compute whole vector a
shifted right by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+dst.qword[1] = (u128)a.qword[1] >> shift;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsll_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.b xr, xr, xr
+CPU Flags: LASX
+
+Logical left shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsll_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.h xr, xr, xr
+CPU Flags: LASX
+
+Logical left shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsll_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.w xr, xr, xr
+CPU Flags: LASX
+
+Logical left shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsll_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.d xr, xr, xr
+CPU Flags: LASX
+
+Logical left shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.b xr, xr, imm
+CPU Flags: LASX
+
+Logical left shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.h xr, xr, imm
+CPU Flags: LASX
+
+Logical left shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.w xr, xr, imm
+CPU Flags: LASX
+
+Logical left shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.d xr, xr, imm
+CPU Flags: LASX
+
+Logical left shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.h.b xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift signed 8-bit elements in a
by imm
to signed 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i + 8] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.hu.bu xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift unsigned 8-bit elements in a
by imm
to unsigned 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i + 8] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.w.h xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift signed 16-bit elements in a
by imm
to signed 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[i + 4] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.wu.hu xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift unsigned 16-bit elements in a
by imm
to unsigned 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[i + 4] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.d.w xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift signed 32-bit elements in a
by imm
to signed 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i + 2] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.du.wu xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift unsigned 32-bit elements in a
by imm
to unsigned 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i + 2] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsra_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.b xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsra_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsra_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsra_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.b xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.b.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? (s8)((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.h.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] =
+ (i < 12) ? (s16)((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.w.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] =
+ (i < 6) ? (s32)((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.b.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? (s8)((s16)b.half[i - 8] >> imm)
+ : (s8)((s16)a.half[i - 16] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.h.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (i < 12) ? (s16)((s32)b.word[i - 4] >> imm)
+ : (s16)((s32)a.word[i - 8] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.w.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+ : (s32)((s64)a.dword[i - 2] >> imm);
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (i < 6) ? (s32)((s64)b.dword[i - 2] >> imm)
+ : (s32)((s64)a.dword[i - 4] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.d.q xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+ : (s64)((s128)a.qword[i - 1] >> imm);
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (i < 3) ? (s64)((s128)b.qword[i - 1] >> imm)
+ : (s64)((s128)a.qword[i - 2] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrar_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.b xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+ (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrar_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+ (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrar_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+ (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrar_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.b xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] =
+ ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] =
+ ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] =
+ ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.b.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+ (((s16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u8 shift = (b.half[i - 8] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 8] >> shift) +
+ (((s16)a.half[i - 8] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.h.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (s16)(s32)a.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+ (((s32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u8 shift = (b.word[i - 4] & 31);
+ if (shift == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 4] >> shift) +
+ (((s32)a.word[i - 4] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.w.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+ (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u8 shift = (b.dword[i - 2] & 63);
+ if (shift == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 2] >> shift) +
+ (((s64)a.dword[i - 2] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+ (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)b.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)b.half[i - 8] >> imm) +
+ (((s16)b.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 16];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 16] >> imm) +
+ (((s16)a.half[i - 16] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)b.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+ (((s32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+ (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)b.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)b.word[i - 4] >> imm) +
+ (((s32)b.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 8];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 8] >> imm) +
+ (((s32)a.word[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)b.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+ (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)b.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)b.dword[i - 2] >> imm) +
+ (((s64)b.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 4];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 4] >> imm) +
+ (((s64)a.dword[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)b.qword[i];
+ } else {
+ dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+ (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)b.qword[i - 1];
+ } else {
+ dst.dword[i] = (s64)(((s128)b.qword[i - 1] >> imm) +
+ (((s128)b.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)a.qword[i - 2];
+ } else {
+ dst.dword[i] = (s64)(((s128)a.qword[i - 2] >> imm) +
+ (((s128)a.qword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrl_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.b xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrl_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrl_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrl_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.b xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.b.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? (u8)((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.h.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] =
+ (i < 12) ? (u16)((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.w.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] =
+ (i < 6) ? (u32)((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? (u8)((u16)b.half[i - 8] >> imm)
+ : (u8)((u16)a.half[i - 16] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (i < 12) ? (u16)((u32)b.word[i - 4] >> imm)
+ : (u16)((u32)a.word[i - 8] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+ : (u32)((u64)a.dword[i - 2] >> imm);
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (i < 6) ? (u32)((u64)b.dword[i - 2] >> imm)
+ : (u32)((u64)a.dword[i - 4] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+ : (u64)((u128)a.qword[i - 1] >> imm);
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (i < 3) ? (u64)((u128)b.qword[i - 1] >> imm)
+ : (u64)((u128)a.qword[i - 2] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.b xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+ ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+ ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+ ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+ ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.b xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+ (((u16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u8 shift = (b.half[i - 8] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 8] >> shift) +
+ (((u16)a.half[i - 8] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (u16)(u32)a.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+ (((u32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u8 shift = (b.word[i - 4] & 31);
+ if (shift == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 4] >> shift) +
+ (((u32)a.word[i - 4] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+ (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u8 shift = (b.dword[i - 2] & 63);
+ if (shift == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 2] >> shift) +
+ (((u64)a.dword[i - 2] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+ (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)b.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)b.half[i - 8] >> imm) +
+ (((u16)b.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 16];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 16] >> imm) +
+ (((u16)a.half[i - 16] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)b.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+ (((u32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+ (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)b.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)b.word[i - 4] >> imm) +
+ (((u32)b.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 8];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 8] >> imm) +
+ (((u32)a.word[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)b.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+ (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)b.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)b.dword[i - 2] >> imm) +
+ (((u64)b.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 4];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 4] >> imm) +
+ (((u64)a.dword[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)b.qword[i];
+ } else {
+ dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+ (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)b.qword[i - 1];
+ } else {
+ dst.dword[i] = (u64)(((u128)b.qword[i - 1] >> imm) +
+ (((u128)b.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)a.qword[i - 2];
+ } else {
+ dst.dword[i] = (u64)(((u128)a.qword[i - 2] >> imm) +
+ (((u128)a.qword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.b.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.bu.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.h.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.hu.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.w.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.wu.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.b.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp = (s16)b.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp = (s16)a.half[i - 16] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.bu.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp = (s16)b.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp = (s16)a.half[i - 16] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.h.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp = (s32)b.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp = (s32)a.word[i - 8] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.hu.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp = (s32)b.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp = (s32)a.word[i - 8] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.w.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp = (s64)b.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp = (s64)a.dword[i - 4] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.wu.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp = (s64)b.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp = (s64)a.dword[i - 4] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.d.q xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ s128 temp = (s128)b.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp = (s128)a.qword[i - 2] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.du.q xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ s128 temp = (s128)b.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp = (s128)a.qword[i - 2] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.b.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp;
+ if ((b.half[i - 8] & 15) == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+ (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.bu.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp;
+ if ((b.half[i - 8] & 15) == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+ (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.h.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp;
+ if ((b.word[i - 4] & 31) == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+ (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.hu.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp;
+ if ((b.word[i - 4] & 31) == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+ (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.w.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp;
+ if ((b.dword[i - 2] & 63) == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+ (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.wu.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp;
+ if ((b.dword[i - 2] & 63) == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+ (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i - 8];
+ } else {
+ temp =
+ ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 16];
+ } else {
+ temp = ((s16)a.half[i - 16] >> imm) +
+ (((s16)a.half[i - 16] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i - 8];
+ } else {
+ temp =
+ ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 16];
+ } else {
+ temp = ((s16)a.half[i - 16] >> imm) +
+ (((s16)a.half[i - 16] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i - 4];
+ } else {
+ temp =
+ ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 8];
+ } else {
+ temp =
+ ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i - 4];
+ } else {
+ temp =
+ ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 8];
+ } else {
+ temp =
+ ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
| CPU | Latency | Throughput (CPI) |
|---|---|---|
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 1 |
__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i - 2];
+ } else {
+ temp = ((s64)b.dword[i - 2] >> imm) +
+ (((s64)b.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 4];
+ } else {
+ temp = ((s64)a.dword[i - 4] >> imm) +
+ (((s64)a.dword[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i - 2];
+ } else {
+ temp = ((s64)b.dword[i - 2] >> imm) +
+ (((s64)b.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 4];
+ } else {
+ temp = ((s64)a.dword[i - 4] >> imm) +
+ (((s64)a.dword[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i - 1];
+ } else {
+ temp = ((s128)b.qword[i - 1] >> imm) +
+ (((s128)b.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 2];
+ } else {
+ temp = ((s128)a.qword[i - 2] >> imm) +
+ (((s128)a.qword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.du.q xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i - 1];
+ } else {
+ temp = ((s128)b.qword[i - 1] >> imm) +
+ (((s128)b.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 2];
+ } else {
+ temp = ((s128)a.qword[i - 2] >> imm) +
+ (((s128)a.qword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.b.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.bu.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.h.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.hu.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.w.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.wu.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp = (u16)b.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp = (u16)a.half[i - 16] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp = (u16)b.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp = (u16)a.half[i - 16] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp = (u32)b.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp = (u32)a.word[i - 8] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp = (u32)b.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp = (u32)a.word[i - 8] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp = (u64)b.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp = (u64)a.dword[i - 4] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp = (u64)b.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp = (u64)a.dword[i - 4] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ u128 temp = (u128)b.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp = (u128)a.qword[i - 2] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.du.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ u128 temp = (u128)b.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp = (u128)a.qword[i - 2] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp;
+ if ((b.half[i - 8] & 15) == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+ (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.bu.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp;
+ if ((b.half[i - 8] & 15) == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+ (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp;
+ if ((b.word[i - 4] & 31) == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+ (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.hu.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp;
+ if ((b.word[i - 4] & 31) == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+ (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp;
+ if ((b.dword[i - 2] & 63) == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+ (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.wu.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp;
+ if ((b.dword[i - 2] & 63) == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+ (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i - 8];
+ } else {
+ temp =
+ ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 16];
+ } else {
+ temp = ((u16)a.half[i - 16] >> imm) +
+ (((u16)a.half[i - 16] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i - 8];
+ } else {
+ temp =
+ ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 16];
+ } else {
+ temp = ((u16)a.half[i - 16] >> imm) +
+ (((u16)a.half[i - 16] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i - 4];
+ } else {
+ temp =
+ ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 8];
+ } else {
+ temp =
+ ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i - 4];
+ } else {
+ temp =
+ ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 8];
+ } else {
+ temp =
+ ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i - 2];
+ } else {
+ temp = ((u64)b.dword[i - 2] >> imm) +
+ (((u64)b.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 4];
+ } else {
+ temp = ((u64)a.dword[i - 4] >> imm) +
+ (((u64)a.dword[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i - 2];
+ } else {
+ temp = ((u64)b.dword[i - 2] >> imm) +
+ (((u64)b.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 4];
+ } else {
+ temp = ((u64)a.dword[i - 4] >> imm) +
+ (((u64)a.dword[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i - 1];
+ } else {
+ temp = ((u128)b.qword[i - 1] >> imm) +
+ (((u128)b.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 2];
+ } else {
+ temp = ((u128)a.qword[i - 2] >> imm) +
+ (((u128)a.qword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.du.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i - 1];
+ } else {
+ temp = ((u128)b.qword[i - 1] >> imm) +
+ (((u128)b.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 2];
+ } else {
+ temp = ((u128)a.qword[i - 2] >> imm) +
+ (((u128)a.qword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m256i __lasx_xvrotr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.b xr, xr, xr
+CPU Flags: LASX
+
+Rotate right the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] =
+ (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvrotr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.h xr, xr, xr
+CPU Flags: LASX
+
+Rotate right the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+ (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvrotr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.w xr, xr, xr
+CPU Flags: LASX
+
+Rotate right the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+ (a.word[i] << ((32 - (b.word[i] & 0x1f)) & 0x1f));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvrotr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.d xr, xr, xr
+CPU Flags: LASX
+
+Rotate right the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+ (a.dword[i] << ((64 - (b.dword[i] & 0x3f)) & 0x3f));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.b xr, xr, imm
+CPU Flags: LASX
+
+Rotate right the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.h xr, xr, imm
+CPU Flags: LASX
+
+Rotate right the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.w xr, xr, imm
+CPU Flags: LASX
+
+Rotate right the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (a.word[i] >> imm) | (a.word[i] << ((32 - imm) & 0x1f));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.d xr, xr, imm
+CPU Flags: LASX
+
+Rotate right the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << ((64 - imm) & 0x3f));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.b xr, xr, xr, xr
+CPU Flags: LASX
+
+Shuffle bytes from a
and b
with indices from c
.
Caveat: the indices are placed in c
, while in other vshuf
intrinsics, they are placed in a
.
__m256i __lasx_xvshuf_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0x1f1f00001a0a1b0b, 0x1111120213031404, 0x0102030405060708, 0x1112131405060708})
+= 0x99997878ee21dd43 0x7777661555144413 0x4321433412341278 0x1234121212341278
+
+for (int i = 0; i < 32; i++) {
+ if ((c.byte[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.byte[i] = 0;
+ } else if ((c.byte[i] % 32) < 16) {
+ dst.byte[i] = b.byte[(c.byte[i] % 32) + ((i >= 16) ? 16 : 0)];
+ } else {
+ dst.byte[i] = a.byte[(c.byte[i] % 32) + ((i >= 16) ? 0 : -16)];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.h xr, xr, xr
+CPU Flags: LASX
+
+Shuffle 16-bit elements in b
and c
with indices from a
, save the result to dst
.
__m256i __lasx_xvshuf_h(__m256i{0x0001000200030004, 0x0005000a000b000c, 0x000f000e00010002, 0x0008000900020001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x1415ef13abcd4321 0x432133441122ff00 0xaabbaabb43211234 0x1234123412344321
+
+for (int i = 0; i < 16; i++) {
+ if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.half[i] = 0;
+ } else if ((a.half[i] % 16) < 8) {
+ dst.half[i] = c.half[(a.half[i] % 16) + ((i >= 8) ? 8 : 0)];
+ } else {
+ dst.half[i] = b.half[(a.half[i] % 16) + ((i >= 8) ? 0 : -8)];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.w xr, xr, xr
+CPU Flags: LASX
+
+Shuffle 32-bit elements in b
and c
with indices from a
, save the result to dst
.
__m256i __lasx_xvshuf_w(__m256i{0x0000000200000004, 0x0000000700000005, 0x0000000100000003, 0x0000000400000000}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x4321432155667788 0x99aabbcc11223344 0x1234123456785678 0x1234123443214321
+
+for (int i = 0; i < 8; i++) {
+ if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.word[i] = 0;
+ } else if ((a.word[i] % 8) < 4) {
+ dst.word[i] = c.word[(a.word[i] % 8) + ((i >= 4) ? 4 : 0)];
+ } else {
+ dst.word[i] = b.word[(a.word[i] % 8) + ((i >= 4) ? 0 : -4)];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.d xr, xr, xr
+CPU Flags: LASX
+
+Shuffle 64-bit elements in b
and c
with indices from a
, save the result to dst
.
__m256i __lasx_xvshuf_d(__m256i{0x0000000000000000, 0x0000000000000003, 0x0000000000000002, 0x0000000000000001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xabcdef1314156678 0x99aabbccddeeff00 0xabcdef1212341234 0x5678567856785678
+
+for (int i = 0; i < 4; i++) {
+ if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.dword[i] = 0;
+ } else if ((a.dword[i] % 4) < 2) {
+ dst.dword[i] = c.dword[(a.dword[i] % 4) + ((i >= 2) ? 2 : 0)];
+ } else {
+ dst.dword[i] = b.dword[(a.dword[i] % 4) + ((i >= 2) ? 0 : -2)];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.b xr, xr, imm
+CPU Flags: LASX
+
+Shuffle every four 8-bit elements in a
with indices packed in imm
, save the result to dst
.
__m256i __lasx_xvshuf4i_b( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x13ef13cd78667815 0x3412343421432121 0x3412343421432121 0x7856787878567878
+
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.h xr, xr, imm
+CPU Flags: LASX
+
+Shuffle every four 16-bit elements in a
with indices packed in imm
, save the result to dst
.
__m256i __lasx_xvshuf4i_h( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x667814156678ef13 0x4321432143211234 0x4321432143211234 0x5678567856785678
+
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.w xr, xr, imm
+CPU Flags: LASX
+
+Shuffle every four 32-bit elements in a
with indices packed in imm
, save the result to dst
.
__m256i __lasx_xvshuf4i_w( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x1415667843214321 0x14156678abcdef13 0x4321432156785678 0x4321432112341234
+
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.d xr, xr, imm
+CPU Flags: LASX
+
+Shuffle every four 64-bit elements in a
and b
with indices packed in imm
, save the result to dst
.
__m256i __lasx_xvshuf4i_d( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0xabcdef1314156678 0x1122334455667788 0x1234123443214321 0xabcdef1212341234
+
+dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+ (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+dst.dword[2] = (imm & 2) ? b.dword[(imm & 1) + 2] : a.dword[(imm & 1) + 2];
+dst.dword[3] =
+ (imm & 8) ? b.dword[((imm >> 2) & 1) + 2] : a.dword[((imm >> 2) & 1) + 2];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
Latency and throughput (CPI) of each instruction:
+Instruction | 3A6000 | 3C5000 | ||
---|---|---|---|---|
Latency | Throughput (CPI) | Latency | Throughput (CPI) | |
vabsd.b | 2 | 2 | 2 | 2 |
vabsd.bu | 2 | 2 | 2 | 2 |
vabsd.d | 2 | 2 | 2 | 2 |
vabsd.du | 2 | 2 | 2 | 2 |
vabsd.h | 2 | 2 | 2 | 2 |
vabsd.hu | 2 | 2 | 2 | 2 |
vabsd.w | 2 | 2 | 2 | 2 |
vabsd.wu | 2 | 2 | 2 | 2 |
vadd.b | 1 | 4 | 1 | 2 |
vadd.d | 1 | 4 | 1 | 2 |
vadd.h | 1 | 4 | 1 | 2 |
vadd.q | 3 | 2 | 3 | 2 |
vadd.w | 1 | 4 | 1 | 2 |
vadda.b | 3 | 2 | 3 | 2 |
vadda.d | 3 | 2 | 3 | 2 |
vadda.h | 3 | 2 | 3 | 2 |
vadda.w | 3 | 2 | 3 | 2 |
vaddi.bu | 1 | 4 | 1 | 2 |
vaddi.du | 1 | 4 | 1 | 2 |
vaddi.hu | 1 | 4 | 1 | 2 |
vaddi.wu | 1 | 4 | 1 | 2 |
vaddwev.d.w | 2 | 2 | 2 | 2 |
vaddwev.d.wu | 2 | 2 | 2 | 2 |
vaddwev.d.wu.w | 2 | 2 | 2 | 2 |
vaddwev.h.b | 2 | 2 | 2 | 2 |
vaddwev.h.bu | 2 | 2 | 2 | 2 |
vaddwev.h.bu.b | 2 | 2 | 2 | 2 |
vaddwev.q.d | 3 | 2 | 3 | 2 |
vaddwev.q.du | 3 | 2 | 3 | 2 |
vaddwev.q.du.d | 3 | 2 | 3 | 2 |
vaddwev.w.h | 2 | 2 | 2 | 2 |
vaddwev.w.hu | 2 | 2 | 2 | 2 |
vaddwev.w.hu.h | 2 | 2 | 2 | 2 |
vaddwod.d.w | 2 | 2 | 2 | 2 |
vaddwod.d.wu | 2 | 2 | 2 | 2 |
vaddwod.d.wu.w | 2 | 2 | 2 | 2 |
vaddwod.h.b | 2 | 2 | 2 | 2 |
vaddwod.h.bu | 2 | 2 | 2 | 2 |
vaddwod.h.bu.b | 2 | 2 | 2 | 2 |
vaddwod.q.d | 3 | 2 | 3 | 2 |
vaddwod.q.du | 3 | 2 | 3 | 2 |
vaddwod.q.du.d | 3 | 2 | 3 | 2 |
vaddwod.w.h | 2 | 2 | 2 | 2 |
vaddwod.w.hu | 2 | 2 | 2 | 2 |
vaddwod.w.hu.h | 2 | 2 | 2 | 2 |
vand.v | 1 | 4 | 1 | 2 |
vandi.b | 1 | 4 | 1 | 2 |
vandn.v | 1 | 4 | 1 | 2 |
vavg.b | 1 | 4 | 1 | 2 |
vavg.bu | 1 | 4 | 1 | 2 |
vavg.d | 2 | 4 | 2 | 2 |
vavg.du | 2 | 4 | 2 | 2 |
vavg.h | 1 | 4 | 1 | 2 |
vavg.hu | 1 | 4 | 1 | 2 |
vavg.w | 1 | 4 | 1 | 2 |
vavg.wu | 1 | 4 | 1 | 2 |
vavgr.b | 1 | 4 | 1 | 2 |
vavgr.bu | 1 | 4 | 1 | 2 |
vavgr.d | 2 | 4 | 2 | 2 |
vavgr.du | 2 | 4 | 2 | 2 |
vavgr.h | 1 | 4 | 1 | 2 |
vavgr.hu | 1 | 4 | 1 | 2 |
vavgr.w | 1 | 4 | 1 | 2 |
vavgr.wu | 1 | 4 | 1 | 2 |
vbitclr.b | 2 | 2 | 2 | 2 |
vbitclr.d | 2 | 2 | 2 | 2 |
vbitclr.h | 2 | 2 | 2 | 2 |
vbitclr.w | 2 | 2 | 2 | 2 |
vbitclri.b | 2 | 2 | 2 | 2 |
vbitclri.d | 2 | 2 | 2 | 2 |
vbitclri.h | 2 | 2 | 2 | 2 |
vbitclri.w | 2 | 2 | 2 | 2 |
vbitrev.b | 2 | 2 | 2 | 2 |
vbitrev.d | 2 | 2 | 2 | 2 |
vbitrev.h | 2 | 2 | 2 | 2 |
vbitrev.w | 2 | 2 | 2 | 2 |
vbitrevi.b | 2 | 2 | 2 | 2 |
vbitrevi.d | 2 | 2 | 2 | 2 |
vbitrevi.h | 2 | 2 | 2 | 2 |
vbitrevi.w | 2 | 2 | 2 | 2 |
vbitsel.v | 1 | 2 | 1 | 2 |
vbitseli.b | 1 | 2 | 1 | 2 |
vbitset.b | 2 | 2 | 2 | 2 |
vbitset.d | 2 | 2 | 2 | 2 |
vbitset.h | 2 | 2 | 2 | 2 |
vbitset.w | 2 | 2 | 2 | 2 |
vbitseti.b | 2 | 2 | 2 | 2 |
vbitseti.d | 2 | 2 | 2 | 2 |
vbitseti.h | 2 | 2 | 2 | 2 |
vbitseti.w | 2 | 2 | 2 | 2 |
vbsll.v | 1 | 4 | 1 | 2 |
vbsrl.v | 1 | 4 | 1 | 2 |
vclo.b | 2 | 4 | 2 | 2 |
vclo.d | 2 | 4 | 2 | 2 |
vclo.h | 2 | 4 | 2 | 2 |
vclo.w | 2 | 4 | 2 | 2 |
vclz.b | 2 | 4 | 2 | 2 |
vclz.d | 2 | 4 | 2 | 2 |
vclz.h | 2 | 4 | 2 | 2 |
vclz.w | 2 | 4 | 2 | 2 |
vdiv.b | 29, 32 | 0.06(1/15.5) | 29, 32 | 0.06(1/17) |
vdiv.bu | 29, 33 | 0.06(1/16.5) | 29, 36 | 0.06(1/18) |
vdiv.d | 8 | 0.25(1/4) | 8, 18.5 | 0.11(1/9) |
vdiv.du | 8 | 0.25(1/4) | 8, 18.5 | 0.11(1/9) |
vdiv.h | 17 | 0.12(1/8.5) | 17, 21.5 | 0.09(1/11) |
vdiv.hu | 17, 22 | 0.11(1/9) | 17, 21.5 | 0.07(1/14) |
vdiv.w | 11 | 0.18(1/5.5) | 11, 17.5 | 0.09(1/11.5) |
vdiv.wu | 11 | 0.18(1/5.5) | 11, 17.5 | 0.07(1/15) |
vext2xv.d.b | 3 | 4 | 3 | 2 |
vext2xv.d.h | 3 | 4 | 3 | 2 |
vext2xv.d.w | 3 | 4 | 3 | 2 |
vext2xv.du.bu | 3 | 4 | 3 | 2 |
vext2xv.du.hu | 3 | 4 | 3 | 2 |
vext2xv.du.wu | 3 | 4 | 3 | 2 |
vext2xv.h.b | 3 | 4 | 3 | 2 |
vext2xv.hu.bu | 3 | 4 | 3 | 2 |
vext2xv.w.b | 3 | 4 | 3 | 2 |
vext2xv.w.h | 3 | 4 | 3 | 2 |
vext2xv.wu.bu | 3 | 4 | 3 | 2 |
vext2xv.wu.hu | 3 | 4 | 3 | 2 |
vexth.d.w | 1 | 4 | 1 | 2 |
vexth.du.wu | 1 | 4 | 1 | 2 |
vexth.h.b | 1 | 4 | 1 | 2 |
vexth.hu.bu | 1 | 4 | 1 | 2 |
vexth.q.d | 1 | 4 | 1 | 2 |
vexth.qu.du | 1 | 4 | 1 | 2 |
vexth.w.h | 1 | 4 | 1 | 2 |
vexth.wu.hu | 1 | 4 | 1 | 2 |
vextl.q.d | 1 | 4 | 1 | 2 |
vextl.qu.du | 1 | 4 | 1 | 2 |
vextrins.b | 1 | 4 | 1 | 2 |
vextrins.d | 1 | 4 | 1 | 2 |
vextrins.h | 1 | 4 | 1 | 2 |
vextrins.w | 1 | 4 | 1 | 2 |
vfadd.d | 3 | 4 | 5 | 2 |
vfadd.s | 3 | 4 | 5 | 2 |
vfclass.d | 2 | 4 | 2 | 2 |
vfclass.s | 2 | 4 | 2 | 2 |
vfcmp.caf.d | 2 | 4 | 2 | 2 |
vfcmp.caf.s | 2 | 4 | 2 | 2 |
vfcmp.ceq.d | 2 | 4 | 2 | 2 |
vfcmp.ceq.s | 2 | 4 | 2 | 2 |
vfcmp.cle.d | 2 | 4 | 2 | 2 |
vfcmp.cle.s | 2 | 4 | 2 | 2 |
vfcmp.clt.d | 2 | 4 | 2 | 2 |
vfcmp.clt.s | 2 | 4 | 2 | 2 |
vfcmp.cne.d | 2 | 4 | 2 | 2 |
vfcmp.cne.s | 2 | 4 | 2 | 2 |
vfcmp.cor.d | 2 | 4 | 2 | 2 |
vfcmp.cor.s | 2 | 4 | 2 | 2 |
vfcmp.cueq.d | 2 | 4 | 2 | 2 |
vfcmp.cueq.s | 2 | 4 | 2 | 2 |
vfcmp.cule.d | 2 | 4 | 2 | 2 |
vfcmp.cule.s | 2 | 4 | 2 | 2 |
vfcmp.cult.d | 2 | 4 | 2 | 2 |
vfcmp.cult.s | 2 | 4 | 2 | 2 |
vfcmp.cun.d | 2 | 4 | 2 | 2 |
vfcmp.cun.s | 2 | 4 | 2 | 2 |
vfcmp.cune.d | 2 | 4 | 2 | 2 |
vfcmp.cune.s | 2 | 4 | 2 | 2 |
vfcmp.saf.d | 2 | 4 | 2 | 2 |
vfcmp.saf.s | 2 | 4 | 2 | 2 |
vfcmp.seq.d | 2 | 4 | 2 | 2 |
vfcmp.seq.s | 2 | 4 | 2 | 2 |
vfcmp.sle.d | 2 | 4 | 2 | 2 |
vfcmp.sle.s | 2 | 4 | 2 | 2 |
vfcmp.slt.d | 2 | 4 | 2 | 2 |
vfcmp.slt.s | 2 | 4 | 2 | 2 |
vfcmp.sne.d | 2 | 4 | 2 | 2 |
vfcmp.sne.s | 2 | 4 | 2 | 2 |
vfcmp.sor.d | 2 | 4 | 2 | 2 |
vfcmp.sor.s | 2 | 4 | 2 | 2 |
vfcmp.sueq.d | 2 | 4 | 2 | 2 |
vfcmp.sueq.s | 2 | 4 | 2 | 2 |
vfcmp.sule.d | 2 | 4 | 2 | 2 |
vfcmp.sule.s | 2 | 4 | 2 | 2 |
vfcmp.sult.d | 2 | 4 | 2 | 2 |
vfcmp.sult.s | 2 | 4 | 2 | 2 |
vfcmp.sun.d | 2 | 4 | 2 | 2 |
vfcmp.sun.s | 2 | 4 | 2 | 2 |
vfcmp.sune.d | 2 | 4 | 2 | 2 |
vfcmp.sune.s | 2 | 4 | 2 | 2 |
vfcvt.h.s | 3 | 2 | 3 | 1 |
vfcvt.s.d | 3 | 2 | 3 | 1 |
vfcvth.d.s | 3 | 2 | 3 | 1 |
vfcvth.s.h | 3 | 2 | 3 | 1 |
vfcvtl.d.s | 3 | 2 | 3 | 1 |
vfcvtl.s.h | 3 | 2 | 3 | 1 |
vfdiv.d | 8, 21.5 | 0.25(1/4) | 8, 16.5 | 0.08(1/12.5) |
vfdiv.s | 11 | 0.18(1/5.5) | 11, 19.5 | 0.13(1/7.5) |
vffint.d.l | 4 | 4 | 4 | 2 |
vffint.d.lu | 4 | 4 | 4 | 2 |
vffint.s.l | 5 | 2 | 5 | 1 |
vffint.s.w | 4 | 4 | 4 | 2 |
vffint.s.wu | 4 | 4 | 4 | 2 |
vffinth.d.w | 5 | 2 | 5 | 1 |
vffintl.d.w | 5 | 2 | 5 | 1 |
vflogb.d | 4 | 4 | 4 | 2 |
vflogb.s | 4 | 4 | 4 | 2 |
vfmadd.d | 5 | 2 | 5 | 2 |
vfmadd.s | 5 | 2 | 5 | 2 |
vfmax.d | 2 | 4 | 2 | 2 |
vfmax.s | 2 | 4 | 2 | 2 |
vfmaxa.d | 2 | 4 | 2 | 2 |
vfmaxa.s | 2 | 4 | 2 | 2 |
vfmin.d | 2 | 4 | 2 | 2 |
vfmin.s | 2 | 4 | 2 | 2 |
vfmina.d | 2 | 4 | 2 | 2 |
vfmina.s | 2 | 4 | 2 | 2 |
vfmsub.d | 5 | 2 | 5 | 2 |
vfmsub.s | 5 | 2 | 5 | 2 |
vfmul.d | 5 | 2 | 5 | 2 |
vfmul.s | 5 | 2 | 5 | 2 |
vfnmadd.d | 5 | 2 | 5 | 2 |
vfnmadd.s | 5 | 2 | 5 | 2 |
vfnmsub.d | 5 | 2 | 5 | 2 |
vfnmsub.s | 5 | 2 | 5 | 2 |
vfrecip.d | 8 | 0.25(1/4) | 23 | 0.08(1/12) |
vfrecip.s | 11 | 0.18(1/5.5) | 27 | 0.14(1/7) |
vfrint.d | 4 | 2 | 4 | 2 |
vfrint.s | 4 | 2 | 4 | 2 |
vfrintrm.d | 4 | 2 | 4 | 2 |
vfrintrm.s | 4 | 2 | 4 | 2 |
vfrintrne.d | 4 | 2 | 4 | 2 |
vfrintrne.s | 4 | 2 | 4 | 2 |
vfrintrp.d | 4 | 2 | 4 | 2 |
vfrintrp.s | 4 | 2 | 4 | 2 |
vfrintrz.d | 4 | 2 | 4 | 2 |
vfrintrz.s | 4 | 2 | 4 | 2 |
vfrsqrt.d | 15 | 0.04(1/26.5) | 15 | 0.04(1/27.5) |
vfrsqrt.s | 17 | 0.05(1/19) | 21 | 0.11(1/9) |
vfrstp.b | 2 | 2 | 2 | 2 |
vfrstp.h | 2 | 2 | 2 | 2 |
vfrstpi.b | 2 | 2 | 2 | 2 |
vfrstpi.h | 2 | 2 | 2 | 2 |
vfsqrt.d | 36 | 0.06(1/17.5) | 36 | 0.05(1/18.5) |
vfsqrt.s | 11 | 0.08(1/12) | 27 | 0.17(1/6) |
vfsub.d | 3 | 4 | 5 | 2 |
vfsub.s | 3 | 4 | 5 | 2 |
vftint.l.d | 4 | 4 | 4 | 2 |
vftint.lu.d | 4 | 4 | 4 | 2 |
vftint.w.d | 5 | 2 | 5 | 1 |
vftint.w.s | 4 | 4 | 4 | 2 |
vftint.wu.s | 4 | 4 | 4 | 2 |
vftinth.l.s | 5 | 2 | 5 | 1 |
vftintl.l.s | 5 | 2 | 5 | 1 |
vftintrm.l.d | 4 | 4 | 4 | 2 |
vftintrm.w.d | 5 | 2 | 5 | 1 |
vftintrm.w.s | 4 | 4 | 4 | 2 |
vftintrmh.l.s | 5 | 2 | 5 | 1 |
vftintrml.l.s | 5 | 2 | 5 | 1 |
vftintrne.l.d | 4 | 4 | 4 | 2 |
vftintrne.w.d | 5 | 2 | 5 | 1 |
vftintrne.w.s | 4 | 4 | 4 | 2 |
vftintrneh.l.s | 5 | 2 | 5 | 1 |
vftintrnel.l.s | 5 | 2 | 5 | 1 |
vftintrp.l.d | 4 | 4 | 4 | 2 |
vftintrp.w.d | 5 | 2 | 5 | 1 |
vftintrp.w.s | 4 | 4 | 4 | 2 |
vftintrph.l.s | 5 | 2 | 5 | 1 |
vftintrpl.l.s | 5 | 2 | 5 | 1 |
vftintrz.l.d | 4 | 4 | 4 | 2 |
vftintrz.lu.d | 4 | 4 | 4 | 2 |
vftintrz.w.d | 5 | 2 | 5 | 1 |
vftintrz.w.s | 4 | 4 | 4 | 2 |
vftintrz.wu.s | 4 | 4 | 4 | 2 |
vftintrzh.l.s | 5 | 2 | 5 | 1 |
vftintrzl.l.s | 5 | 2 | 5 | 1 |
vhaddw.d.w | 2 | 2 | 2 | 2 |
vhaddw.du.wu | 2 | 2 | 2 | 2 |
vhaddw.h.b | 2 | 2 | 2 | 2 |
vhaddw.hu.bu | 2 | 2 | 2 | 2 |
vhaddw.q.d | 3 | 2 | 3 | 2 |
vhaddw.qu.du | 3 | 2 | 3 | 2 |
vhaddw.w.h | 2 | 2 | 2 | 2 |
vhaddw.wu.hu | 2 | 2 | 2 | 2 |
vhsubw.d.w | 2 | 2 | 2 | 2 |
vhsubw.du.wu | 2 | 2 | 2 | 2 |
vhsubw.h.b | 2 | 2 | 2 | 2 |
vhsubw.hu.bu | 2 | 2 | 2 | 2 |
vhsubw.q.d | 3 | 2 | 3 | 2 |
vhsubw.qu.du | 3 | 2 | 3 | 2 |
vhsubw.w.h | 2 | 2 | 2 | 2 |
vhsubw.wu.hu | 2 | 2 | 2 | 2 |
vilvh.b | 1 | 4 | 1 | 2 |
vilvh.d | 1 | 4 | 1 | 2 |
vilvh.h | 1 | 4 | 1 | 2 |
vilvh.w | 1 | 4 | 1 | 2 |
vilvl.b | 1 | 4 | 1 | 2 |
vilvl.d | 1 | 4 | 1 | 2 |
vilvl.h | 1 | 4 | 1 | 2 |
vilvl.w | 1 | 4 | 1 | 2 |
vinsgr2vr.b | 1 | 1 | 1 | 1 |
vinsgr2vr.d | 1 | 1 | 1 | 1 |
vinsgr2vr.h | 1 | 1 | 1 | 1 |
vinsgr2vr.w | 1 | 1 | 1 | 1 |
vmadd.b | 4 | 2 | 4 | 2 |
vmadd.d | 4 | 2 | 4 | 2 |
vmadd.h | 4 | 2 | 4 | 2 |
vmadd.w | 4 | 2 | 4 | 2 |
vmaddwev.d.w | 4 | 2 | 4 | 2 |
vmaddwev.d.wu | 4 | 2 | 4 | 2 |
vmaddwev.d.wu.w | 4 | 2 | 4 | 2 |
vmaddwev.h.b | 4 | 2 | 4 | 2 |
vmaddwev.h.bu | 4 | 2 | 4 | 2 |
vmaddwev.h.bu.b | 4 | 2 | 4 | 2 |
vmaddwev.q.d | 7 | 1.14 | 7 | 1.14 |
vmaddwev.q.du | 7 | 1.14 | 7 | 1.14 |
vmaddwev.q.du.d | 7 | 1.14 | 7 | 1.14 |
vmaddwev.w.h | 4 | 2 | 4 | 2 |
vmaddwev.w.hu | 4 | 2 | 4 | 2 |
vmaddwev.w.hu.h | 4 | 2 | 4 | 2 |
vmaddwod.d.w | 4 | 2 | 4 | 2 |
vmaddwod.d.wu | 4 | 2 | 4 | 2 |
vmaddwod.d.wu.w | 4 | 2 | 4 | 2 |
vmaddwod.h.b | 4 | 2 | 4 | 2 |
vmaddwod.h.bu | 4 | 2 | 4 | 2 |
vmaddwod.h.bu.b | 4 | 2 | 4 | 2 |
vmaddwod.q.d | 7 | 1.14 | 7 | 1.14 |
vmaddwod.q.du | 7 | 1.14 | 7 | 1.14 |
vmaddwod.q.du.d | 7 | 1.14 | 7 | 1.14 |
vmaddwod.w.h | 4 | 2 | 4 | 2 |
vmaddwod.w.hu | 4 | 2 | 4 | 2 |
vmaddwod.w.hu.h | 4 | 2 | 4 | 2 |
vmax.b | 1 | 4 | 1 | 2 |
vmax.bu | 1 | 4 | 1 | 2 |
vmax.d | 2 | 4 | 2 | 2 |
vmax.du | 2 | 4 | 2 | 2 |
vmax.h | 1 | 4 | 1 | 2 |
vmax.hu | 1 | 4 | 1 | 2 |
vmax.w | 1 | 4 | 1 | 2 |
vmax.wu | 1 | 4 | 1 | 2 |
vmaxi.b | 1 | 4 | 1 | 2 |
vmaxi.bu | 1 | 4 | 1 | 2 |
vmaxi.d | 2 | 4 | 2 | 2 |
vmaxi.du | 2 | 4 | 2 | 2 |
vmaxi.h | 1 | 4 | 1 | 2 |
vmaxi.hu | 1 | 4 | 1 | 2 |
vmaxi.w | 1 | 4 | 1 | 2 |
vmaxi.wu | 1 | 4 | 1 | 2 |
vmin.b | 1 | 4 | 1 | 2 |
vmin.bu | 1 | 4 | 1 | 2 |
vmin.d | 2 | 4 | 2 | 2 |
vmin.du | 2 | 4 | 2 | 2 |
vmin.h | 1 | 4 | 1 | 2 |
vmin.hu | 1 | 4 | 1 | 2 |
vmin.w | 1 | 4 | 1 | 2 |
vmin.wu | 1 | 4 | 1 | 2 |
vmini.b | 1 | 4 | 1 | 2 |
vmini.bu | 1 | 4 | 1 | 2 |
vmini.d | 2 | 4 | 2 | 2 |
vmini.du | 2 | 4 | 2 | 2 |
vmini.h | 1 | 4 | 1 | 2 |
vmini.hu | 1 | 4 | 1 | 2 |
vmini.w | 1 | 4 | 1 | 2 |
vmini.wu | 1 | 4 | 1 | 2 |
vmod.b | 29, 35 | 0.06(1/15.5) | 29, 33 | 0.06(1/17) |
vmod.bu | 29, 37 | 0.06(1/17.5) | 29, 33 | 0.05(1/19) |
vmod.d | 8, 10 | 0.25(1/4) | 8, 10 | 0.11(1/9.5) |
vmod.du | 8, 10 | 0.25(1/4) | 8, 10 | 0.11(1/9.5) |
vmod.h | 17, 21 | 0.12(1/8.5) | 17, 21 | 0.09(1/11) |
vmod.hu | 17, 21 | 0.11(1/9.5) | 17, 21 | 0.07(1/15) |
vmod.w | 11, 13 | 0.18(1/5.5) | 11, 15 | 0.08(1/12) |
vmod.wu | 11, 13 | 0.18(1/5.5) | 11, 15 | 0.06(1/16) |
vmskgez.b | 1 | 4 | 1 | 2 |
vmskltz.b | 1 | 4 | 1 | 2 |
vmskltz.d | 1 | 4 | 1 | 2 |
vmskltz.h | 1 | 4 | 1 | 2 |
vmskltz.w | 1 | 4 | 1 | 2 |
vmsknz.b | 1 | 4 | 1 | 2 |
vmsub.b | 4 | 2 | 4 | 2 |
vmsub.d | 4 | 2 | 4 | 2 |
vmsub.h | 4 | 2 | 4 | 2 |
vmsub.w | 4 | 2 | 4 | 2 |
vmuh.b | 4 | 2 | 4 | 2 |
vmuh.bu | 4 | 2 | 4 | 2 |
vmuh.d | 4 | 2 | 4 | 2 |
vmuh.du | 4 | 2 | 4 | 2 |
vmuh.h | 4 | 2 | 4 | 2 |
vmuh.hu | 4 | 2 | 4 | 2 |
vmuh.w | 4 | 2 | 4 | 2 |
vmuh.wu | 4 | 2 | 4 | 2 |
vmul.b | 4 | 2 | 4 | 2 |
vmul.d | 4 | 2 | 4 | 2 |
vmul.h | 4 | 2 | 4 | 2 |
vmul.w | 4 | 2 | 4 | 2 |
vmulwev.d.w | 4 | 2 | 4 | 2 |
vmulwev.d.wu | 4 | 2 | 4 | 2 |
vmulwev.d.wu.w | 4 | 2 | 4 | 2 |
vmulwev.h.b | 4 | 2 | 4 | 2 |
vmulwev.h.bu | 4 | 2 | 4 | 2 |
vmulwev.h.bu.b | 4 | 2 | 4 | 2 |
vmulwev.q.d | 7 | 2 | 7 | 2 |
vmulwev.q.du | 7 | 2 | 7 | 2 |
vmulwev.q.du.d | 7 | 2 | 7 | 2 |
vmulwev.w.h | 4 | 2 | 4 | 2 |
vmulwev.w.hu | 4 | 2 | 4 | 2 |
vmulwev.w.hu.h | 4 | 2 | 4 | 2 |
vmulwod.d.w | 4 | 2 | 4 | 2 |
vmulwod.d.wu | 4 | 2 | 4 | 2 |
vmulwod.d.wu.w | 4 | 2 | 4 | 2 |
vmulwod.h.b | 4 | 2 | 4 | 2 |
vmulwod.h.bu | 4 | 2 | 4 | 2 |
vmulwod.h.bu.b | 4 | 2 | 4 | 2 |
vmulwod.q.d | 7 | 2 | 7 | 2 |
vmulwod.q.du | 7 | 2 | 7 | 2 |
vmulwod.q.du.d | 7 | 2 | 7 | 2 |
vmulwod.w.h | 4 | 2 | 4 | 2 |
vmulwod.w.hu | 4 | 2 | 4 | 2 |
vmulwod.w.hu.h | 4 | 2 | 4 | 2 |
vneg.b | 1 | 4 | 1 | 2 |
vneg.d | 1 | 4 | 1 | 2 |
vneg.h | 1 | 4 | 1 | 2 |
vneg.w | 1 | 4 | 1 | 2 |
vnor.v | 1 | 4 | 1 | 2 |
vnori.b | 1 | 4 | 1 | 2 |
vor.v | 1 | 4 | 1 | 2 |
vori.b | 1 | 4 | 1 | 2 |
vorn.v | 1 | 4 | 1 | 2 |
vpackev.b | 1 | 4 | 1 | 2 |
vpackev.d | 1 | 4 | 1 | 2 |
vpackev.h | 1 | 4 | 1 | 2 |
vpackev.w | 1 | 4 | 1 | 2 |
vpackod.b | 1 | 4 | 1 | 2 |
vpackod.d | 1 | 4 | 1 | 2 |
vpackod.h | 1 | 4 | 1 | 2 |
vpackod.w | 1 | 4 | 1 | 2 |
vpcnt.b | 2 | 2 | 2 | 2 |
vpcnt.d | 2 | 2 | 2 | 2 |
vpcnt.h | 2 | 2 | 2 | 2 |
vpcnt.w | 2 | 2 | 2 | 2 |
vpermi.w | 1 | 4 | 1 | 2 |
vpickev.b | 1 | 4 | 1 | 2 |
vpickev.d | 1 | 4 | 1 | 2 |
vpickev.h | 1 | 4 | 1 | 2 |
vpickev.w | 1 | 4 | 1 | 2 |
vpickod.b | 1 | 4 | 1 | 2 |
vpickod.d | 1 | 4 | 1 | 2 |
vpickod.h | 1 | 4 | 1 | 2 |
vpickod.w | 1 | 4 | 1 | 2 |
vpickve2gr.b | 1 | 1 | 1 | 1 |
vpickve2gr.bu | 1 | 1 | 1 | 1 |
vpickve2gr.d | 1 | 1 | 1 | 1 |
vpickve2gr.du | 1 | 1 | 1 | 1 |
vpickve2gr.h | 1 | 1 | 1 | 1 |
vpickve2gr.hu | 1 | 1 | 1 | 1 |
vpickve2gr.w | 1 | 1 | 1 | 1 |
vpickve2gr.wu | 1 | 1 | 1 | 1 |
vreplgr2vr.b | N/A | 1 | N/A | 1 |
vreplgr2vr.d | N/A | 1 | N/A | 1 |
vreplgr2vr.h | N/A | 1 | N/A | 1 |
vreplgr2vr.w | N/A | 1 | N/A | 1 |
vrepli.b | N/A | 6 | N/A | 2 |
vrepli.d | N/A | 4 | N/A | 2 |
vrepli.h | N/A | 4 | N/A | 2 |
vrepli.w | N/A | 4 | N/A | 2 |
vreplve.b | 1 | 1 | 1 | 1 |
vreplve.d | 1 | 1 | 1 | 1 |
vreplve.h | 1 | 1 | 1 | 1 |
vreplve.w | 1 | 1 | 1 | 1 |
vreplvei.b | 1 | 4 | 1 | 2 |
vreplvei.d | 1 | 4 | 1 | 2 |
vreplvei.h | 1 | 4 | 1 | 2 |
vreplvei.w | 1 | 4 | 1 | 2 |
vrotr.b | 1 | 4 | 2 | 2 |
vrotr.d | 1 | 4 | 2 | 2 |
vrotr.h | 1 | 4 | 2 | 2 |
vrotr.w | 1 | 4 | 2 | 2 |
vrotri.b | 1 | 4 | 2 | 2 |
vrotri.d | 1 | 4 | 2 | 2 |
vrotri.h | 1 | 4 | 2 | 2 |
vrotri.w | 1 | 4 | 2 | 2 |
vsadd.b | 1 | 4 | 1 | 2 |
vsadd.bu | 1 | 4 | 1 | 2 |
vsadd.d | 1 | 4 | 1 | 2 |
vsadd.du | 1 | 4 | 1 | 2 |
vsadd.h | 1 | 4 | 1 | 2 |
vsadd.hu | 1 | 4 | 1 | 2 |
vsadd.w | 1 | 4 | 1 | 2 |
vsadd.wu | 1 | 4 | 1 | 2 |
vsat.b | 2 | 2 | 2 | 2 |
vsat.bu | 2 | 2 | 2 | 2 |
vsat.d | 2 | 2 | 2 | 2 |
vsat.du | 2 | 2 | 2 | 2 |
vsat.h | 2 | 2 | 2 | 2 |
vsat.hu | 2 | 2 | 2 | 2 |
vsat.w | 2 | 2 | 2 | 2 |
vsat.wu | 2 | 2 | 2 | 2 |
vseq.b | 1 | 4 | 1 | 2 |
vseq.d | 1 | 4 | 1 | 2 |
vseq.h | 1 | 4 | 1 | 2 |
vseq.w | 1 | 4 | 1 | 2 |
vseqi.b | 1 | 4 | 1 | 2 |
vseqi.d | 1 | 4 | 1 | 2 |
vseqi.h | 1 | 4 | 1 | 2 |
vseqi.w | 1 | 4 | 1 | 2 |
vsetallnez.b | N/A | 2 | N/A | 2 |
vsetallnez.d | N/A | 2 | N/A | 2 |
vsetallnez.h | N/A | 2 | N/A | 2 |
vsetallnez.w | N/A | 2 | N/A | 2 |
vsetanyeqz.b | N/A | 2 | N/A | 2 |
vsetanyeqz.d | N/A | 2 | N/A | 2 |
vsetanyeqz.h | N/A | 2 | N/A | 2 |
vsetanyeqz.w | N/A | 2 | N/A | 2 |
vseteqz.v | N/A | 2 | N/A | 2 |
vsetnez.v | N/A | 2 | N/A | 2 |
vshuf4i.b | 1 | 4 | 1 | 2 |
vshuf4i.d | 1 | 4 | 1 | 2 |
vshuf4i.h | 1 | 4 | 1 | 2 |
vshuf4i.w | 1 | 4 | 1 | 2 |
vshuf.b | 1 | 2 | 1 | 2 |
vshuf.d | 1 | 2 | 1 | 2 |
vshuf.h | 1 | 2 | 1 | 2 |
vshuf.w | 1 | 2 | 1 | 2 |
vsigncov.b | 1 | 2 | 1 | 2 |
vsigncov.d | 1 | 2 | 1 | 2 |
vsigncov.h | 1 | 2 | 1 | 2 |
vsigncov.w | 1 | 2 | 1 | 2 |
vsle.b | 1 | 4 | 1 | 2 |
vsle.bu | 1 | 4 | 1 | 2 |
vsle.d | 2 | 4 | 2 | 2 |
vsle.du | 2 | 4 | 2 | 2 |
vsle.h | 1 | 4 | 1 | 2 |
vsle.hu | 1 | 4 | 1 | 2 |
vsle.w | 1 | 4 | 1 | 2 |
vsle.wu | 1 | 4 | 1 | 2 |
vslei.b | 1 | 4 | 1 | 2 |
vslei.bu | 1 | 4 | 1 | 2 |
vslei.d | 2 | 4 | 2 | 2 |
vslei.du | 2 | 4 | 2 | 2 |
vslei.h | 1 | 4 | 1 | 2 |
vslei.hu | 1 | 4 | 1 | 2 |
vslei.w | 1 | 4 | 1 | 2 |
vslei.wu | 1 | 4 | 1 | 2 |
vsll.b | 1 | 4 | 1 | 2 |
vsll.d | 1 | 4 | 1 | 2 |
vsll.h | 1 | 4 | 1 | 2 |
vsll.w | 1 | 4 | 1 | 2 |
vslli.b | 1 | 4 | 1 | 2 |
vslli.d | 1 | 4 | 1 | 2 |
vslli.h | 1 | 4 | 1 | 2 |
vslli.w | 1 | 4 | 1 | 2 |
vsllwil.d.w | 2 | 2 | 2 | 1 |
vsllwil.du.wu | 2 | 2 | 2 | 1 |
vsllwil.h.b | 2 | 2 | 2 | 1 |
vsllwil.hu.bu | 2 | 2 | 2 | 1 |
vsllwil.w.h | 2 | 2 | 2 | 1 |
vsllwil.wu.hu | 2 | 2 | 2 | 1 |
vslt.b | 1 | 4 | 1 | 2 |
vslt.bu | 1 | 4 | 1 | 2 |
vslt.d | 2 | 4 | 2 | 2 |
vslt.du | 2 | 4 | 2 | 2 |
vslt.h | 1 | 4 | 1 | 2 |
vslt.hu | 1 | 4 | 1 | 2 |
vslt.w | 1 | 4 | 1 | 2 |
vslt.wu | 1 | 4 | 1 | 2 |
vslti.b | 1 | 4 | 1 | 2 |
vslti.bu | 1 | 4 | 1 | 2 |
vslti.d | 2 | 4 | 2 | 2 |
vslti.du | 2 | 4 | 2 | 2 |
vslti.h | 1 | 4 | 1 | 2 |
vslti.hu | 1 | 4 | 1 | 2 |
vslti.w | 1 | 4 | 1 | 2 |
vslti.wu | 1 | 4 | 1 | 2 |
vsra.b | 1 | 4 | 1 | 2 |
vsra.d | 1 | 4 | 1 | 2 |
vsra.h | 1 | 4 | 1 | 2 |
vsra.w | 1 | 4 | 1 | 2 |
vsrai.b | 1 | 4 | 1 | 2 |
vsrai.d | 1 | 4 | 1 | 2 |
vsrai.h | 1 | 4 | 1 | 2 |
vsrai.w | 1 | 4 | 1 | 2 |
vsran.b.h | 2 | 2 | 2 | 1 |
vsran.h.w | 2 | 2 | 2 | 1 |
vsran.w.d | 2 | 2 | 2 | 1 |
vsrani.b.h | 4 | 2 | 4 | 1 |
vsrani.d.q | 3 | 2 | 3 | 2 |
vsrani.h.w | 4 | 2 | 4 | 1 |
vsrani.w.d | 4 | 2 | 4 | 1 |
vsrar.b | 3 | 2 | 3 | 2 |
vsrar.d | 3 | 2 | 3 | 2 |
vsrar.h | 3 | 2 | 3 | 2 |
vsrar.w | 3 | 2 | 3 | 2 |
vsrari.b | 3 | 2 | 3 | 2 |
vsrari.d | 3 | 2 | 3 | 2 |
vsrari.h | 3 | 2 | 3 | 2 |
vsrari.w | 3 | 2 | 3 | 2 |
vsrarn.b.h | 4 | 2 | 4 | 1 |
vsrarn.h.w | 4 | 2 | 4 | 1 |
vsrarn.w.d | 4 | 2 | 4 | 1 |
vsrarni.b.h | 4 | 2 | 4 | 1 |
vsrarni.d.q | 3 | 2 | 3 | 2 |
vsrarni.h.w | 4 | 2 | 4 | 1 |
vsrarni.w.d | 4 | 2 | 4 | 1 |
vsrl.b | 1 | 4 | 1 | 2 |
vsrl.d | 1 | 4 | 1 | 2 |
vsrl.h | 1 | 4 | 1 | 2 |
vsrl.w | 1 | 4 | 1 | 2 |
vsrli.b | 1 | 4 | 1 | 2 |
vsrli.d | 1 | 4 | 1 | 2 |
vsrli.h | 1 | 4 | 1 | 2 |
vsrli.w | 1 | 4 | 1 | 2 |
vsrln.b.h | 2 | 2 | 2 | 1 |
vsrln.h.w | 2 | 2 | 2 | 1 |
vsrln.w.d | 2 | 2 | 2 | 1 |
vsrlni.b.h | 4 | 2 | 4 | 1 |
vsrlni.d.q | 3 | 2 | 3 | 2 |
vsrlni.h.w | 4 | 2 | 4 | 1 |
vsrlni.w.d | 4 | 2 | 4 | 1 |
vsrlr.b | 3 | 2 | 3 | 2 |
vsrlr.d | 3 | 2 | 3 | 2 |
vsrlr.h | 3 | 2 | 3 | 2 |
vsrlr.w | 3 | 2 | 3 | 2 |
vsrlri.b | 3 | 2 | 3 | 2 |
vsrlri.d | 3 | 2 | 3 | 2 |
vsrlri.h | 3 | 2 | 3 | 2 |
vsrlri.w | 3 | 2 | 3 | 2 |
vsrlrn.b.h | 4 | 2 | 4 | 1 |
vsrlrn.h.w | 4 | 2 | 4 | 1 |
vsrlrn.w.d | 4 | 2 | 4 | 1 |
vsrlrni.b.h | 4 | 2 | 4 | 1 |
vsrlrni.d.q | 3 | 2 | 3 | 2 |
vsrlrni.h.w | 4 | 2 | 4 | 1 |
vsrlrni.w.d | 4 | 2 | 4 | 1 |
vssran.b.h | 4 | 2 | 4 | 1 |
vssran.bu.h | 4 | 2 | 4 | 1 |
vssran.h.w | 4 | 2 | 4 | 1 |
vssran.hu.w | 4 | 2 | 4 | 1 |
vssran.w.d | 4 | 2 | 4 | 1 |
vssran.wu.d | 4 | 2 | 4 | 1 |
vssrani.b.h | 4 | 2 | 4 | 1 |
vssrani.bu.h | 4 | 2 | 4 | 1 |
vssrani.d.q | 3 | 2 | 3 | 2 |
vssrani.du.q | 3 | 2 | 3 | 2 |
vssrani.h.w | 4 | 2 | 4 | 1 |
vssrani.hu.w | 4 | 2 | 4 | 1 |
vssrani.w.d | 4 | 2 | 4 | 1 |
vssrani.wu.d | 4 | 2 | 4 | 1 |
vssrarn.b.h | 4 | 2 | 4 | 1 |
vssrarn.bu.h | 4 | 2 | 4 | 1 |
vssrarn.h.w | 4 | 2 | 4 | 1 |
vssrarn.hu.w | 4 | 2 | 4 | 1 |
vssrarn.w.d | 4 | 2 | 4 | 1 |
vssrarn.wu.d | 4 | 2 | 4 | 1 |
vssrarni.b.h | 4 | 2 | 4 | 1 |
vssrarni.bu.h | 4 | 2 | 4 | 1 |
vssrarni.d.q | 3 | 2 | 3 | 2 |
vssrarni.du.q | 3 | 2 | 3 | 2 |
vssrarni.h.w | 4 | 2 | 4 | 1 |
vssrarni.hu.w | 4 | 2 | 4 | 1 |
vssrarni.w.d | 4 | 2 | 4 | 1 |
vssrarni.wu.d | 4 | 2 | 4 | 1 |
vssrln.b.h | 4 | 2 | 4 | 1 |
vssrln.bu.h | 4 | 2 | 4 | 1 |
vssrln.h.w | 4 | 2 | 4 | 1 |
vssrln.hu.w | 4 | 2 | 4 | 1 |
vssrln.w.d | 4 | 2 | 4 | 1 |
vssrln.wu.d | 4 | 2 | 4 | 1 |
vssrlni.b.h | 4 | 2 | 4 | 1 |
vssrlni.bu.h | 4 | 2 | 4 | 1 |
vssrlni.d.q | 3 | 2 | 3 | 2 |
vssrlni.du.q | 3 | 2 | 3 | 2 |
vssrlni.h.w | 4 | 2 | 4 | 1 |
vssrlni.hu.w | 4 | 2 | 4 | 1 |
vssrlni.w.d | 4 | 2 | 4 | 1 |
vssrlni.wu.d | 4 | 2 | 4 | 1 |
vssrlrn.b.h | 4 | 2 | 4 | 1 |
vssrlrn.bu.h | 4 | 2 | 4 | 1 |
vssrlrn.h.w | 4 | 2 | 4 | 1 |
vssrlrn.hu.w | 4 | 2 | 4 | 1 |
vssrlrn.w.d | 4 | 2 | 4 | 1 |
vssrlrn.wu.d | 4 | 2 | 4 | 1 |
vssrlrni.b.h | 4 | 2 | 4 | 1 |
vssrlrni.bu.h | 4 | 2 | 4 | 1 |
vssrlrni.d.q | 3 | 2 | 3 | 2 |
vssrlrni.du.q | 3 | 2 | 3 | 2 |
vssrlrni.h.w | 4 | 2 | 4 | 1 |
vssrlrni.hu.w | 4 | 2 | 4 | 1 |
vssrlrni.w.d | 4 | 2 | 4 | 1 |
vssrlrni.wu.d | 4 | 2 | 4 | 1 |
vssub.b | 1 | 4 | 1 | 2 |
vssub.bu | 1 | 4 | 1 | 2 |
vssub.d | 1 | 4 | 1 | 2 |
vssub.du | 1 | 4 | 1 | 2 |
vssub.h | 1 | 4 | 1 | 2 |
vssub.hu | 1 | 4 | 1 | 2 |
vssub.w | 1 | 4 | 1 | 2 |
vssub.wu | 1 | 4 | 1 | 2 |
vsub.b | 1 | 4 | 1 | 2 |
vsub.d | 1 | 4 | 1 | 2 |
vsub.h | 1 | 4 | 1 | 2 |
vsub.q | 3 | 2 | 3 | 2 |
vsub.w | 1 | 4 | 1 | 2 |
vsubi.bu | 1 | 4 | 1 | 2 |
vsubi.du | 1 | 4 | 1 | 2 |
vsubi.hu | 1 | 4 | 1 | 2 |
vsubi.wu | 1 | 4 | 1 | 2 |
vsubwev.d.w | 2 | 2 | 2 | 2 |
vsubwev.d.wu | 2 | 2 | 2 | 2 |
vsubwev.h.b | 2 | 2 | 2 | 2 |
vsubwev.h.bu | 2 | 2 | 2 | 2 |
vsubwev.q.d | 3 | 2 | 3 | 2 |
vsubwev.q.du | 3 | 2 | 3 | 2 |
vsubwev.w.h | 2 | 2 | 2 | 2 |
vsubwev.w.hu | 2 | 2 | 2 | 2 |
vsubwod.d.w | 2 | 2 | 2 | 2 |
vsubwod.d.wu | 2 | 2 | 2 | 2 |
vsubwod.h.b | 2 | 2 | 2 | 2 |
vsubwod.h.bu | 2 | 2 | 2 | 2 |
vsubwod.q.d | 3 | 2 | 3 | 2 |
vsubwod.q.du | 3 | 2 | 3 | 2 |
vsubwod.w.h | 2 | 2 | 2 | 2 |
vsubwod.w.hu | 2 | 2 | 2 | 2 |
vxor.v | 1 | 4 | 1 | 2 |
vxori.b | 1 | 4 | 1 | 2 |
xvabsd.b | 2 | 2 | 2 | 2 |
xvabsd.bu | 2 | 2 | 2 | 2 |
xvabsd.d | 2 | 2 | 2 | 2 |
xvabsd.du | 2 | 2 | 2 | 2 |
xvabsd.h | 2 | 2 | 2 | 2 |
xvabsd.hu | 2 | 2 | 2 | 2 |
xvabsd.w | 2 | 2 | 2 | 2 |
xvabsd.wu | 2 | 2 | 2 | 2 |
xvadd.b | 1 | 4 | 1 | 2 |
xvadd.d | 1 | 4 | 1 | 2 |
xvadd.h | 1 | 4 | 1 | 2 |
xvadd.q | 3 | 2 | 3 | 2 |
xvadd.w | 1 | 4 | 1 | 2 |
xvadda.b | 3 | 2 | 3 | 2 |
xvadda.d | 3 | 2 | 3 | 2 |
xvadda.h | 3 | 2 | 3 | 2 |
xvadda.w | 3 | 2 | 3 | 2 |
xvaddi.bu | 1 | 4 | 1 | 2 |
xvaddi.du | 1 | 4 | 1 | 2 |
xvaddi.hu | 1 | 4 | 1 | 2 |
xvaddi.wu | 1 | 4 | 1 | 2 |
xvaddwev.d.w | 2 | 2 | 2 | 2 |
xvaddwev.d.wu | 2 | 2 | 2 | 2 |
xvaddwev.d.wu.w | 2 | 2 | 2 | 2 |
xvaddwev.h.b | 2 | 2 | 2 | 2 |
xvaddwev.h.bu | 2 | 2 | 2 | 2 |
xvaddwev.h.bu.b | 2 | 2 | 2 | 2 |
xvaddwev.q.d | 3 | 2 | 3 | 2 |
xvaddwev.q.du | 3 | 2 | 3 | 2 |
xvaddwev.q.du.d | 3 | 2 | 3 | 2 |
xvaddwev.w.h | 2 | 2 | 2 | 2 |
xvaddwev.w.hu | 2 | 2 | 2 | 2 |
xvaddwev.w.hu.h | 2 | 2 | 2 | 2 |
xvaddwod.d.w | 2 | 2 | 2 | 2 |
xvaddwod.d.wu | 2 | 2 | 2 | 2 |
xvaddwod.d.wu.w | 2 | 2 | 2 | 2 |
xvaddwod.h.b | 2 | 2 | 2 | 2 |
xvaddwod.h.bu | 2 | 2 | 2 | 2 |
xvaddwod.h.bu.b | 2 | 2 | 2 | 2 |
xvaddwod.q.d | 3 | 2 | 3 | 2 |
xvaddwod.q.du | 3 | 2 | 3 | 2 |
xvaddwod.q.du.d | 3 | 2 | 3 | 2 |
xvaddwod.w.h | 2 | 2 | 2 | 2 |
xvaddwod.w.hu | 2 | 2 | 2 | 2 |
xvaddwod.w.hu.h | 2 | 2 | 2 | 2 |
xvand.v | 1 | 4 | 1 | 2 |
xvandi.b | 1 | 4 | 1 | 2 |
xvandn.v | 1 | 4 | 1 | 2 |
xvavg.b | 1 | 4 | 1 | 2 |
xvavg.bu | 1 | 4 | 1 | 2 |
xvavg.d | 2 | 4 | 2 | 2 |
xvavg.du | 2 | 4 | 2 | 2 |
xvavg.h | 1 | 4 | 1 | 2 |
xvavg.hu | 1 | 4 | 1 | 2 |
xvavg.w | 1 | 4 | 1 | 2 |
xvavg.wu | 1 | 4 | 1 | 2 |
xvavgr.b | 1 | 4 | 1 | 2 |
xvavgr.bu | 1 | 4 | 1 | 2 |
xvavgr.d | 2 | 4 | 2 | 2 |
xvavgr.du | 2 | 4 | 2 | 2 |
xvavgr.h | 1 | 4 | 1 | 2 |
xvavgr.hu | 1 | 4 | 1 | 2 |
xvavgr.w | 1 | 4 | 1 | 2 |
xvavgr.wu | 1 | 4 | 1 | 2 |
xvbitclr.b | 2 | 2 | 2 | 2 |
xvbitclr.d | 2 | 2 | 2 | 2 |
xvbitclr.h | 2 | 2 | 2 | 2 |
xvbitclr.w | 2 | 2 | 2 | 2 |
xvbitclri.b | 2 | 2 | 2 | 2 |
xvbitclri.d | 2 | 2 | 2 | 2 |
xvbitclri.h | 2 | 2 | 2 | 2 |
xvbitclri.w | 2 | 2 | 2 | 2 |
xvbitrev.b | 2 | 2 | 2 | 2 |
xvbitrev.d | 2 | 2 | 2 | 2 |
xvbitrev.h | 2 | 2 | 2 | 2 |
xvbitrev.w | 2 | 2 | 2 | 2 |
xvbitrevi.b | 2 | 2 | 2 | 2 |
xvbitrevi.d | 2 | 2 | 2 | 2 |
xvbitrevi.h | 2 | 2 | 2 | 2 |
xvbitrevi.w | 2 | 2 | 2 | 2 |
xvbitsel.v | 1 | 2 | 1 | 2 |
xvbitseli.b | 1 | 2 | 1 | 2 |
xvbitset.b | 2 | 2 | 2 | 2 |
xvbitset.d | 2 | 2 | 2 | 2 |
xvbitset.h | 2 | 2 | 2 | 2 |
xvbitset.w | 2 | 2 | 2 | 2 |
xvbitseti.b | 2 | 2 | 2 | 2 |
xvbitseti.d | 2 | 2 | 2 | 2 |
xvbitseti.h | 2 | 2 | 2 | 2 |
xvbitseti.w | 2 | 2 | 2 | 2 |
xvbsll.v | 1 | 4 | 1 | 2 |
xvbsrl.v | 1 | 4 | 1 | 2 |
xvclo.b | 2 | 4 | 2 | 2 |
xvclo.d | 2 | 4 | 2 | 2 |
xvclo.h | 2 | 4 | 2 | 2 |
xvclo.w | 2 | 4 | 2 | 2 |
xvclz.b | 2 | 4 | 2 | 2 |
xvclz.d | 2 | 4 | 2 | 2 |
xvclz.h | 2 | 4 | 2 | 2 |
xvclz.w | 2 | 4 | 2 | 2 |
xvdiv.b | 29, 32 | 0.06(1/15.5) | 32, 36 | 0.05(1/20.5) |
xvdiv.bu | 29, 33 | 0.06(1/16.5) | 29, 36 | 0.05(1/20.5) |
xvdiv.d | 8 | 0.25(1/4) | 8, 18.5 | 0.11(1/9) |
xvdiv.du | 8 | 0.25(1/4) | 8, 18.5 | 0.11(1/9) |
xvdiv.h | 17 | 0.12(1/8.5) | 21.5, 22 | 0.08(1/13) |
xvdiv.hu | 17, 22 | 0.11(1/9) | 17, 21.5 | 0.07(1/15) |
xvdiv.w | 11 | 0.18(1/5.5) | 11, 17.5 | 0.09(1/11.5) |
xvdiv.wu | 11 | 0.18(1/5.5) | 11, 17.5 | 0.07(1/15) |
xvexth.d.w | 1 | 4 | 1 | 2 |
xvexth.du.wu | 1 | 4 | 1 | 2 |
xvexth.h.b | 1 | 4 | 1 | 2 |
xvexth.hu.bu | 1 | 4 | 1 | 2 |
xvexth.q.d | 1 | 4 | 1 | 2 |
xvexth.qu.du | 1 | 4 | 1 | 2 |
xvexth.w.h | 1 | 4 | 1 | 2 |
xvexth.wu.hu | 1 | 4 | 1 | 2 |
xvextl.q.d | 1 | 4 | 1 | 2 |
xvextl.qu.du | 1 | 4 | 1 | 2 |
xvextrins.b | 1 | 4 | 1 | 2 |
xvextrins.d | 1 | 4 | 1 | 2 |
xvextrins.h | 1 | 4 | 1 | 2 |
xvextrins.w | 1 | 4 | 1 | 2 |
xvfadd.d | 3 | 4 | 5 | 2 |
xvfadd.s | 3 | 4 | 5 | 2 |
xvfclass.d | 2 | 4 | 2 | 2 |
xvfclass.s | 2 | 4 | 2 | 2 |
xvfcmp.caf.d | 2 | 4 | 2 | 2 |
xvfcmp.caf.s | 2 | 4 | 2 | 2 |
xvfcmp.ceq.d | 2 | 4 | 2 | 2 |
xvfcmp.ceq.s | 2 | 4 | 2 | 2 |
xvfcmp.cle.d | 2 | 4 | 2 | 2 |
xvfcmp.cle.s | 2 | 4 | 2 | 2 |
xvfcmp.clt.d | 2 | 4 | 2 | 2 |
xvfcmp.clt.s | 2 | 4 | 2 | 2 |
xvfcmp.cne.d | 2 | 4 | 2 | 2 |
xvfcmp.cne.s | 2 | 4 | 2 | 2 |
xvfcmp.cor.d | 2 | 4 | 2 | 2 |
xvfcmp.cor.s | 2 | 4 | 2 | 2 |
xvfcmp.cueq.d | 2 | 4 | 2 | 2 |
xvfcmp.cueq.s | 2 | 4 | 2 | 2 |
xvfcmp.cule.d | 2 | 4 | 2 | 2 |
xvfcmp.cule.s | 2 | 4 | 2 | 2 |
xvfcmp.cult.d | 2 | 4 | 2 | 2 |
xvfcmp.cult.s | 2 | 4 | 2 | 2 |
xvfcmp.cun.d | 2 | 4 | 2 | 2 |
xvfcmp.cun.s | 2 | 4 | 2 | 2 |
xvfcmp.cune.d | 2 | 4 | 2 | 2 |
xvfcmp.cune.s | 2 | 4 | 2 | 2 |
xvfcmp.saf.d | 2 | 4 | 2 | 2 |
xvfcmp.saf.s | 2 | 4 | 2 | 2 |
xvfcmp.seq.d | 2 | 4 | 2 | 2 |
xvfcmp.seq.s | 2 | 4 | 2 | 2 |
xvfcmp.sle.d | 2 | 4 | 2 | 2 |
xvfcmp.sle.s | 2 | 4 | 2 | 2 |
xvfcmp.slt.d | 2 | 4 | 2 | 2 |
xvfcmp.slt.s | 2 | 4 | 2 | 2 |
xvfcmp.sne.d | 2 | 4 | 2 | 2 |
xvfcmp.sne.s | 2 | 4 | 2 | 2 |
xvfcmp.sor.d | 2 | 4 | 2 | 2 |
xvfcmp.sor.s | 2 | 4 | 2 | 2 |
xvfcmp.sueq.d | 2 | 4 | 2 | 2 |
xvfcmp.sueq.s | 2 | 4 | 2 | 2 |
xvfcmp.sule.d | 2 | 4 | 2 | 2 |
xvfcmp.sule.s | 2 | 4 | 2 | 2 |
xvfcmp.sult.d | 2 | 4 | 2 | 2 |
xvfcmp.sult.s | 2 | 4 | 2 | 2 |
xvfcmp.sun.d | 2 | 4 | 2 | 2 |
xvfcmp.sun.s | 2 | 4 | 2 | 2 |
xvfcmp.sune.d | 2 | 4 | 2 | 2 |
xvfcmp.sune.s | 2 | 4 | 2 | 2 |
xvfcvt.h.s | 3 | 2 | 3 | 1 |
xvfcvt.s.d | 3 | 2 | 3 | 1 |
xvfcvth.d.s | 3 | 2 | 3 | 1 |
xvfcvth.s.h | 3 | 2 | 3 | 1 |
xvfcvtl.d.s | 3 | 2 | 3 | 1 |
xvfcvtl.s.h | 3 | 2 | 3 | 1 |
xvfdiv.d | 8, 21.5 | 0.25(1/4) | 8, 17 | 0.08(1/12.5) |
xvfdiv.s | 11 | 0.18(1/5.5) | 11, 19.5 | 0.1(1/10.5) |
xvffint.d.l | 4 | 4 | 4 | 2 |
xvffint.d.lu | 4 | 4 | 4 | 2 |
xvffint.s.l | 5 | 2 | 5 | 1 |
xvffint.s.w | 4 | 4 | 4 | 2 |
xvffint.s.wu | 4 | 4 | 4 | 2 |
xvffinth.d.w | 5 | 2 | 5 | 1 |
xvffintl.d.w | 5 | 2 | 5 | 1 |
xvflogb.d | 4 | 4 | 4 | 2 |
xvflogb.s | 4 | 4 | 4 | 2 |
xvfmadd.d | 5 | 2 | 5 | 2 |
xvfmadd.s | 5 | 2 | 5 | 2 |
xvfmax.d | 2 | 4 | 2 | 2 |
xvfmax.s | 2 | 4 | 2 | 2 |
xvfmaxa.d | 2 | 4 | 2 | 2 |
xvfmaxa.s | 2 | 4 | 2 | 2 |
xvfmin.d | 2 | 4 | 2 | 2 |
xvfmin.s | 2 | 4 | 2 | 2 |
xvfmina.d | 2 | 4 | 2 | 2 |
xvfmina.s | 2 | 4 | 2 | 2 |
xvfmsub.d | 5 | 2 | 5 | 2 |
xvfmsub.s | 5 | 2 | 5 | 2 |
xvfmul.d | 5 | 2 | 5 | 2 |
xvfmul.s | 5 | 2 | 5 | 2 |
xvfnmadd.d | 5 | 2 | 5 | 2 |
xvfnmadd.s | 5 | 2 | 5 | 2 |
xvfnmsub.d | 5 | 2 | 5 | 2 |
xvfnmsub.s | 5 | 2 | 5 | 2 |
xvfrecip.d | 23 | 0.25(1/4) | 23 | 0.08(1/12) |
xvfrecip.s | 27 | 0.18(1/5.5) | 27 | 0.14(1/7) |
xvfrint.d | 4 | 2 | 4 | 2 |
xvfrint.s | 4 | 2 | 4 | 2 |
xvfrintrm.d | 4 | 2 | 4 | 2 |
xvfrintrm.s | 4 | 2 | 4 | 2 |
xvfrintrne.d | 4 | 2 | 4 | 2 |
xvfrintrne.s | 4 | 2 | 4 | 2 |
xvfrintrp.d | 4 | 2 | 4 | 2 |
xvfrintrp.s | 4 | 2 | 4 | 2 |
xvfrintrz.d | 4 | 2 | 4 | 2 |
xvfrintrz.s | 4 | 2 | 4 | 2 |
xvfrsqrt.d | 15 | 0.04(1/26.5) | 15 | 0.04(1/27.5) |
xvfrsqrt.s | 25 | 0.05(1/19) | 25 | 0.03(1/32) |
xvfrstp.b | 2 | 2 | 2 | 2 |
xvfrstp.h | 2 | 2 | 2 | 2 |
xvfrstpi.b | 2 | 2 | 2 | 2 |
xvfrstpi.h | 2 | 2 | 2 | 2 |
xvfsqrt.d | 36 | 0.06(1/17.5) | 36 | 0.05(1/18.5) |
xvfsqrt.s | 15 | 0.08(1/12) | 15 | 0.07(1/13.5) |
xvfsub.d | 3 | 4 | 5 | 2 |
xvfsub.s | 3 | 4 | 5 | 2 |
xvftint.l.d | 4 | 4 | 4 | 2 |
xvftint.lu.d | 4 | 4 | 4 | 2 |
xvftint.w.d | 5 | 2 | 5 | 1 |
xvftint.w.s | 4 | 4 | 4 | 2 |
xvftint.wu.s | 4 | 4 | 4 | 2 |
xvftinth.l.s | 5 | 2 | 5 | 1 |
xvftintl.l.s | 5 | 2 | 5 | 1 |
xvftintrm.l.d | 4 | 4 | 4 | 2 |
xvftintrm.w.d | 5 | 2 | 5 | 1 |
xvftintrm.w.s | 4 | 4 | 4 | 2 |
xvftintrmh.l.s | 5 | 2 | 5 | 1 |
xvftintrml.l.s | 5 | 2 | 5 | 1 |
xvftintrne.l.d | 4 | 4 | 4 | 2 |
xvftintrne.w.d | 5 | 2 | 5 | 1 |
xvftintrne.w.s | 4 | 4 | 4 | 2 |
xvftintrneh.l.s | 5 | 2 | 5 | 1 |
xvftintrnel.l.s | 5 | 2 | 5 | 1 |
xvftintrp.l.d | 4 | 4 | 4 | 2 |
xvftintrp.w.d | 5 | 2 | 5 | 1 |
xvftintrp.w.s | 4 | 4 | 4 | 2 |
xvftintrph.l.s | 5 | 2 | 5 | 1 |
xvftintrpl.l.s | 5 | 2 | 5 | 1 |
xvftintrz.l.d | 4 | 4 | 4 | 2 |
xvftintrz.lu.d | 4 | 4 | 4 | 2 |
xvftintrz.w.d | 5 | 2 | 5 | 1 |
xvftintrz.w.s | 4 | 4 | 4 | 2 |
xvftintrz.wu.s | 4 | 4 | 4 | 2 |
xvftintrzh.l.s | 5 | 2 | 5 | 1 |
xvftintrzl.l.s | 5 | 2 | 5 | 1 |
xvhaddw.d.w | 2 | 2 | 2 | 2 |
xvhaddw.du.wu | 2 | 2 | 2 | 2 |
xvhaddw.h.b | 2 | 2 | 2 | 2 |
xvhaddw.hu.bu | 2 | 2 | 2 | 2 |
xvhaddw.q.d | 3 | 2 | 3 | 2 |
xvhaddw.qu.du | 3 | 2 | 3 | 2 |
xvhaddw.w.h | 2 | 2 | 2 | 2 |
xvhaddw.wu.hu | 2 | 2 | 2 | 2 |
xvhseli.d | 1 | 1 | 1 | 1 |
xvhsubw.d.w | 2 | 2 | 2 | 2 |
xvhsubw.du.wu | 2 | 2 | 2 | 2 |
xvhsubw.h.b | 2 | 2 | 2 | 2 |
xvhsubw.hu.bu | 2 | 2 | 2 | 2 |
xvhsubw.q.d | 3 | 2 | 3 | 2 |
xvhsubw.qu.du | 3 | 2 | 3 | 2 |
xvhsubw.w.h | 2 | 2 | 2 | 2 |
xvhsubw.wu.hu | 2 | 2 | 2 | 2 |
xvilvh.b | 1 | 4 | 1 | 2 |
xvilvh.d | 1 | 4 | 1 | 2 |
xvilvh.h | 1 | 4 | 1 | 2 |
xvilvh.w | 1 | 4 | 1 | 2 |
xvilvl.b | 1 | 4 | 1 | 2 |
xvilvl.d | 1 | 4 | 1 | 2 |
xvilvl.h | 1 | 4 | 1 | 2 |
xvilvl.w | 1 | 4 | 1 | 2 |
xvinsgr2vr.d | 1 | 1 | 1 | 1 |
xvinsgr2vr.w | 1 | 1 | 1 | 1 |
xvinsve0.d | 1 | 4 | 1 | 2 |
xvinsve0.w | 1 | 4 | 1 | 2 |
xvmadd.b | 4 | 2 | 4 | 2 |
xvmadd.d | 4 | 2 | 4 | 2 |
xvmadd.h | 4 | 2 | 4 | 2 |
xvmadd.w | 4 | 2 | 4 | 2 |
xvmaddwev.d.w | 4 | 2 | 4 | 2 |
xvmaddwev.d.wu | 4 | 2 | 4 | 2 |
xvmaddwev.d.wu.w | 4 | 2 | 4 | 2 |
xvmaddwev.h.b | 4 | 2 | 4 | 2 |
xvmaddwev.h.bu | 4 | 2 | 4 | 2 |
xvmaddwev.h.bu.b | 4 | 2 | 4 | 2 |
xvmaddwev.q.d | 7 | 1.14 | 7 | 1.14 |
xvmaddwev.q.du | 7 | 1.14 | 7 | 1.14 |
xvmaddwev.q.du.d | 7 | 1.14 | 7 | 1.14 |
xvmaddwev.w.h | 4 | 2 | 4 | 2 |
xvmaddwev.w.hu | 4 | 2 | 4 | 2 |
xvmaddwev.w.hu.h | 4 | 2 | 4 | 2 |
xvmaddwod.d.w | 4 | 2 | 4 | 2 |
xvmaddwod.d.wu | 4 | 2 | 4 | 2 |
xvmaddwod.d.wu.w | 4 | 2 | 4 | 2 |
xvmaddwod.h.b | 4 | 2 | 4 | 2 |
xvmaddwod.h.bu | 4 | 2 | 4 | 2 |
xvmaddwod.h.bu.b | 4 | 2 | 4 | 2 |
xvmaddwod.q.d | 7 | 1.14 | 7 | 1.14 |
xvmaddwod.q.du | 7 | 1.14 | 7 | 1.14 |
xvmaddwod.q.du.d | 7 | 1.14 | 7 | 1.14 |
xvmaddwod.w.h | 4 | 2 | 4 | 2 |
xvmaddwod.w.hu | 4 | 2 | 4 | 2 |
xvmaddwod.w.hu.h | 4 | 2 | 4 | 2 |
xvmax.b | 1 | 4 | 1 | 2 |
xvmax.bu | 1 | 4 | 1 | 2 |
xvmax.d | 2 | 4 | 2 | 2 |
xvmax.du | 2 | 4 | 2 | 2 |
xvmax.h | 1 | 4 | 1 | 2 |
xvmax.hu | 1 | 4 | 1 | 2 |
xvmax.w | 1 | 4 | 1 | 2 |
xvmax.wu | 1 | 4 | 1 | 2 |
xvmaxi.b | 1 | 4 | 1 | 2 |
xvmaxi.bu | 1 | 4 | 1 | 2 |
xvmaxi.d | 2 | 4 | 2 | 2 |
xvmaxi.du | 2 | 4 | 2 | 2 |
xvmaxi.h | 1 | 4 | 1 | 2 |
xvmaxi.hu | 1 | 4 | 1 | 2 |
xvmaxi.w | 1 | 4 | 1 | 2 |
xvmaxi.wu | 1 | 4 | 1 | 2 |
xvmin.b | 1 | 4 | 1 | 2 |
xvmin.bu | 1 | 4 | 1 | 2 |
xvmin.d | 2 | 4 | 2 | 2 |
xvmin.du | 2 | 4 | 2 | 2 |
xvmin.h | 1 | 4 | 1 | 2 |
xvmin.hu | 1 | 4 | 1 | 2 |
xvmin.w | 1 | 4 | 1 | 2 |
xvmin.wu | 1 | 4 | 1 | 2 |
xvmini.b | 1 | 4 | 1 | 2 |
xvmini.bu | 1 | 4 | 1 | 2 |
xvmini.d | 2 | 4 | 2 | 2 |
xvmini.du | 2 | 4 | 2 | 2 |
xvmini.h | 1 | 4 | 1 | 2 |
xvmini.hu | 1 | 4 | 1 | 2 |
xvmini.w | 1 | 4 | 1 | 2 |
xvmini.wu | 1 | 4 | 1 | 2 |
xvmod.b | 29, 41 | 0.06(1/15.5) | 29, 33 | 0.05(1/21.5) |
xvmod.bu | 29, 37 | 0.06(1/17.5) | 29, 37 | 0.05(1/22) |
xvmod.d | 8, 10 | 0.25(1/4) | 8, 10 | 0.11(1/9.5) |
xvmod.du | 8, 10 | 0.25(1/4) | 8, 10 | 0.11(1/9.5) |
xvmod.h | 17, 21 | 0.12(1/8.5) | 17, 21 | 0.07(1/13.5) |
xvmod.hu | 17, 25 | 0.11(1/9.5) | 17, 23 | 0.06(1/16) |
xvmod.w | 11, 13 | 0.18(1/5.5) | 11, 15 | 0.07(1/13.5) |
xvmod.wu | 11, 13 | 0.18(1/5.5) | 11, 15 | 0.06(1/16) |
xvmskgez.b | 1 | 4 | 1 | 2 |
xvmskltz.b | 1 | 4 | 1 | 2 |
xvmskltz.d | 1 | 4 | 1 | 2 |
xvmskltz.h | 1 | 4 | 1 | 2 |
xvmskltz.w | 1 | 4 | 1 | 2 |
xvmsknz.b | 1 | 4 | 1 | 2 |
xvmsub.b | 4 | 2 | 4 | 2 |
xvmsub.d | 4 | 2 | 4 | 2 |
xvmsub.h | 4 | 2 | 4 | 2 |
xvmsub.w | 4 | 2 | 4 | 2 |
xvmuh.b | 4 | 2 | 4 | 2 |
xvmuh.bu | 4 | 2 | 4 | 2 |
xvmuh.d | 4 | 2 | 4 | 2 |
xvmuh.du | 4 | 2 | 4 | 2 |
xvmuh.h | 4 | 2 | 4 | 2 |
xvmuh.hu | 4 | 2 | 4 | 2 |
xvmuh.w | 4 | 2 | 4 | 2 |
xvmuh.wu | 4 | 2 | 4 | 2 |
xvmul.b | 4 | 2 | 4 | 2 |
xvmul.d | 4 | 2 | 4 | 2 |
xvmul.h | 4 | 2 | 4 | 2 |
xvmul.w | 4 | 2 | 4 | 2 |
xvmulwev.d.w | 4 | 2 | 4 | 2 |
xvmulwev.d.wu | 4 | 2 | 4 | 2 |
xvmulwev.d.wu.w | 4 | 2 | 4 | 2 |
xvmulwev.h.b | 4 | 2 | 4 | 2 |
xvmulwev.h.bu | 4 | 2 | 4 | 2 |
xvmulwev.h.bu.b | 4 | 2 | 4 | 2 |
xvmulwev.q.d | 7 | 2 | 7 | 2 |
xvmulwev.q.du | 7 | 2 | 7 | 2 |
xvmulwev.q.du.d | 7 | 2 | 7 | 2 |
xvmulwev.w.h | 4 | 2 | 4 | 2 |
xvmulwev.w.hu | 4 | 2 | 4 | 2 |
xvmulwev.w.hu.h | 4 | 2 | 4 | 2 |
xvmulwod.d.w | 4 | 2 | 4 | 2 |
xvmulwod.d.wu | 4 | 2 | 4 | 2 |
xvmulwod.d.wu.w | 4 | 2 | 4 | 2 |
xvmulwod.h.b | 4 | 2 | 4 | 2 |
xvmulwod.h.bu | 4 | 2 | 4 | 2 |
xvmulwod.h.bu.b | 4 | 2 | 4 | 2 |
xvmulwod.q.d | 7 | 2 | 7 | 2 |
xvmulwod.q.du | 7 | 2 | 7 | 2 |
xvmulwod.q.du.d | 7 | 2 | 7 | 2 |
xvmulwod.w.h | 4 | 2 | 4 | 2 |
xvmulwod.w.hu | 4 | 2 | 4 | 2 |
xvmulwod.w.hu.h | 4 | 2 | 4 | 2 |
xvneg.b | 1 | 4 | 1 | 2 |
xvneg.d | 1 | 4 | 1 | 2 |
xvneg.h | 1 | 4 | 1 | 2 |
xvneg.w | 1 | 4 | 1 | 2 |
xvnor.v | 1 | 4 | 1 | 2 |
xvnori.b | 1 | 4 | 1 | 2 |
xvor.v | 1 | 4 | 1 | 2 |
xvori.b | 1 | 4 | 1 | 2 |
xvorn.v | 1 | 4 | 1 | 2 |
xvpackev.b | 1 | 4 | 1 | 2 |
xvpackev.d | 1 | 4 | 1 | 2 |
xvpackev.h | 1 | 4 | 1 | 2 |
xvpackev.w | 1 | 4 | 1 | 2 |
xvpackod.b | 1 | 4 | 1 | 2 |
xvpackod.d | 1 | 4 | 1 | 2 |
xvpackod.h | 1 | 4 | 1 | 2 |
xvpackod.w | 1 | 4 | 1 | 2 |
xvpcnt.b | 2 | 2 | 2 | 2 |
xvpcnt.d | 2 | 2 | 2 | 2 |
xvpcnt.h | 2 | 2 | 2 | 2 |
xvpcnt.w | 2 | 2 | 2 | 2 |
xvperm.w | 3 | 4 | 3 | 2 |
xvpermi.d | 3 | 4 | 3 | 2 |
xvpermi.q | 3 | 2.67 | 3 | 2 |
xvpermi.w | 1 | 4 | 1 | 2 |
xvpickev.b | 1 | 4 | 1 | 2 |
xvpickev.d | 1 | 4 | 1 | 2 |
xvpickev.h | 1 | 4 | 1 | 2 |
xvpickev.w | 1 | 4 | 1 | 2 |
xvpickod.b | 1 | 4 | 1 | 2 |
xvpickod.d | 1 | 4 | 1 | 2 |
xvpickod.h | 1 | 4 | 1 | 2 |
xvpickod.w | 1 | 4 | 1 | 2 |
xvpickve2gr.d | 1 | 1 | 1 | 1 |
xvpickve2gr.du | 1 | 1 | 1 | 1 |
xvpickve2gr.w | 1 | 1 | 1 | 1 |
xvpickve2gr.wu | 1 | 1 | 1 | 1 |
xvpickve.d | 3 | 4 | 3 | 2 |
xvpickve.w | 3 | 4 | 3 | 2 |
xvrepl128vei.b | 1 | 4 | 1 | 2 |
xvrepl128vei.d | 1 | 4 | 1 | 2 |
xvrepl128vei.h | 1 | 4 | 1 | 2 |
xvrepl128vei.w | 1 | 4 | 1 | 2 |
xvreplgr2vr.b | N/A | 1 | N/A | 1 |
xvreplgr2vr.d | N/A | 1 | N/A | 1 |
xvreplgr2vr.h | N/A | 1 | N/A | 1 |
xvreplgr2vr.w | N/A | 1 | N/A | 1 |
xvrepli.b | N/A | 6 | N/A | 2 |
xvrepli.d | N/A | 4 | N/A | 2 |
xvrepli.h | N/A | 4 | N/A | 2 |
xvrepli.w | N/A | 4 | N/A | 2 |
xvreplve0.b | 3 | 4 | 3 | 2 |
xvreplve0.d | 3 | 4 | 3 | 2 |
xvreplve0.h | 3 | 4 | 3 | 2 |
xvreplve0.q | 3 | 4 | 3 | 2 |
xvreplve0.w | 3 | 4 | 3 | 2 |
xvreplve.b | 1 | 1 | 1 | 1 |
xvreplve.d | 1 | 1 | 1 | 1 |
xvreplve.h | 1 | 1 | 1 | 1 |
xvreplve.w | 1 | 1 | 1 | 1 |
xvrotr.b | 1 | 4 | 2 | 2 |
xvrotr.d | 1 | 4 | 2 | 2 |
xvrotr.h | 1 | 4 | 2 | 2 |
xvrotr.w | 1 | 4 | 2 | 2 |
xvrotri.b | 1 | 4 | 2 | 2 |
xvrotri.d | 1 | 4 | 2 | 2 |
xvrotri.h | 1 | 4 | 2 | 2 |
xvrotri.w | 1 | 4 | 2 | 2 |
xvsadd.b | 1 | 4 | 1 | 2 |
xvsadd.bu | 1 | 4 | 1 | 2 |
xvsadd.d | 1 | 4 | 1 | 2 |
xvsadd.du | 1 | 4 | 1 | 2 |
xvsadd.h | 1 | 4 | 1 | 2 |
xvsadd.hu | 1 | 4 | 1 | 2 |
xvsadd.w | 1 | 4 | 1 | 2 |
xvsadd.wu | 1 | 4 | 1 | 2 |
xvsat.b | 2 | 2 | 2 | 2 |
xvsat.bu | 2 | 2 | 2 | 2 |
xvsat.d | 2 | 2 | 2 | 2 |
xvsat.du | 2 | 2 | 2 | 2 |
xvsat.h | 2 | 2 | 2 | 2 |
xvsat.hu | 2 | 2 | 2 | 2 |
xvsat.w | 2 | 2 | 2 | 2 |
xvsat.wu | 2 | 2 | 2 | 2 |
xvseq.b | 1 | 4 | 1 | 2 |
xvseq.d | 1 | 4 | 1 | 2 |
xvseq.h | 1 | 4 | 1 | 2 |
xvseq.w | 1 | 4 | 1 | 2 |
xvseqi.b | 1 | 4 | 1 | 2 |
xvseqi.d | 1 | 4 | 1 | 2 |
xvseqi.h | 1 | 4 | 1 | 2 |
xvseqi.w | 1 | 4 | 1 | 2 |
xvsetallnez.b | N/A | 2 | N/A | 2 |
xvsetallnez.d | N/A | 2 | N/A | 2 |
xvsetallnez.h | N/A | 2 | N/A | 2 |
xvsetallnez.w | N/A | 2 | N/A | 2 |
xvsetanyeqz.b | N/A | 2 | N/A | 2 |
xvsetanyeqz.d | N/A | 2 | N/A | 2 |
xvsetanyeqz.h | N/A | 2 | N/A | 2 |
xvsetanyeqz.w | N/A | 2 | N/A | 2 |
xvseteqz.v | N/A | 2 | N/A | 2 |
xvsetnez.v | N/A | 2 | N/A | 2 |
xvshuf4i.b | 1 | 4 | 1 | 2 |
xvshuf4i.d | 1 | 4 | 1 | 2 |
xvshuf4i.h | 1 | 4 | 1 | 2 |
xvshuf4i.w | 1 | 4 | 1 | 2 |
xvshuf.b | 1 | 2 | 1 | 2 |
xvshuf.d | 1 | 2 | 1 | 2 |
xvshuf.h | 1 | 2 | 1 | 2 |
xvshuf.w | 1 | 2 | 1 | 2 |
xvsigncov.b | 1 | 2 | 1 | 2 |
xvsigncov.d | 1 | 2 | 1 | 2 |
xvsigncov.h | 1 | 2 | 1 | 2 |
xvsigncov.w | 1 | 2 | 1 | 2 |
xvsle.b | 1 | 4 | 1 | 2 |
xvsle.bu | 1 | 4 | 1 | 2 |
xvsle.d | 2 | 4 | 2 | 2 |
xvsle.du | 2 | 4 | 2 | 2 |
xvsle.h | 1 | 4 | 1 | 2 |
xvsle.hu | 1 | 4 | 1 | 2 |
xvsle.w | 1 | 4 | 1 | 2 |
xvsle.wu | 1 | 4 | 1 | 2 |
xvslei.b | 1 | 4 | 1 | 2 |
xvslei.bu | 1 | 4 | 1 | 2 |
xvslei.d | 2 | 4 | 2 | 2 |
xvslei.du | 2 | 4 | 2 | 2 |
xvslei.h | 1 | 4 | 1 | 2 |
xvslei.hu | 1 | 4 | 1 | 2 |
xvslei.w | 1 | 4 | 1 | 2 |
xvslei.wu | 1 | 4 | 1 | 2 |
xvsll.b | 1 | 4 | 1 | 2 |
xvsll.d | 1 | 4 | 1 | 2 |
xvsll.h | 1 | 4 | 1 | 2 |
xvsll.w | 1 | 4 | 1 | 2 |
xvslli.b | 1 | 4 | 1 | 2 |
xvslli.d | 1 | 4 | 1 | 2 |
xvslli.h | 1 | 4 | 1 | 2 |
xvslli.w | 1 | 4 | 1 | 2 |
xvsllwil.d.w | 2 | 2 | 2 | 1 |
xvsllwil.du.wu | 2 | 2 | 2 | 1 |
xvsllwil.h.b | 2 | 2 | 2 | 1 |
xvsllwil.hu.bu | 2 | 2 | 2 | 1 |
xvsllwil.w.h | 2 | 2 | 2 | 1 |
xvsllwil.wu.hu | 2 | 2 | 2 | 1 |
xvslt.b | 1 | 4 | 1 | 2 |
xvslt.bu | 1 | 4 | 1 | 2 |
xvslt.d | 2 | 4 | 2 | 2 |
xvslt.du | 2 | 4 | 2 | 2 |
xvslt.h | 1 | 4 | 1 | 2 |
xvslt.hu | 1 | 4 | 1 | 2 |
xvslt.w | 1 | 4 | 1 | 2 |
xvslt.wu | 1 | 4 | 1 | 2 |
xvslti.b | 1 | 4 | 1 | 2 |
xvslti.bu | 1 | 4 | 1 | 2 |
xvslti.d | 2 | 4 | 2 | 2 |
xvslti.du | 2 | 4 | 2 | 2 |
xvslti.h | 1 | 4 | 1 | 2 |
xvslti.hu | 1 | 4 | 1 | 2 |
xvslti.w | 1 | 4 | 1 | 2 |
xvslti.wu | 1 | 4 | 1 | 2 |
xvsra.b | 1 | 4 | 1 | 2 |
xvsra.d | 1 | 4 | 1 | 2 |
xvsra.h | 1 | 4 | 1 | 2 |
xvsra.w | 1 | 4 | 1 | 2 |
xvsrai.b | 1 | 4 | 1 | 2 |
xvsrai.d | 1 | 4 | 1 | 2 |
xvsrai.h | 1 | 4 | 1 | 2 |
xvsrai.w | 1 | 4 | 1 | 2 |
xvsran.b.h | 2 | 2 | 2 | 1 |
xvsran.h.w | 2 | 2 | 2 | 1 |
xvsran.w.d | 2 | 2 | 2 | 1 |
xvsrani.b.h | 4 | 2 | 4 | 1 |
xvsrani.d.q | 3 | 2 | 3 | 2 |
xvsrani.h.w | 4 | 2 | 4 | 1 |
xvsrani.w.d | 4 | 2 | 4 | 1 |
xvsrar.b | 3 | 2 | 3 | 2 |
xvsrar.d | 3 | 2 | 3 | 2 |
xvsrar.h | 3 | 2 | 3 | 2 |
xvsrar.w | 3 | 2 | 3 | 2 |
xvsrari.b | 3 | 2 | 3 | 2 |
xvsrari.d | 3 | 2 | 3 | 2 |
xvsrari.h | 3 | 2 | 3 | 2 |
xvsrari.w | 3 | 2 | 3 | 2 |
xvsrarn.b.h | 4 | 2 | 4 | 1 |
xvsrarn.h.w | 4 | 2 | 4 | 1 |
xvsrarn.w.d | 4 | 2 | 4 | 1 |
xvsrarni.b.h | 4 | 2 | 4 | 1 |
xvsrarni.d.q | 3 | 2 | 3 | 2 |
xvsrarni.h.w | 4 | 2 | 4 | 1 |
xvsrarni.w.d | 4 | 2 | 4 | 1 |
xvsrl.b | 1 | 4 | 1 | 2 |
xvsrl.d | 1 | 4 | 1 | 2 |
xvsrl.h | 1 | 4 | 1 | 2 |
xvsrl.w | 1 | 4 | 1 | 2 |
xvsrli.b | 1 | 4 | 1 | 2 |
xvsrli.d | 1 | 4 | 1 | 2 |
xvsrli.h | 1 | 4 | 1 | 2 |
xvsrli.w | 1 | 4 | 1 | 2 |
xvsrln.b.h | 2 | 2 | 2 | 1 |
xvsrln.h.w | 2 | 2 | 2 | 1 |
xvsrln.w.d | 2 | 2 | 2 | 1 |
xvsrlni.b.h | 4 | 2 | 4 | 1 |
xvsrlni.d.q | 3 | 2 | 3 | 2 |
xvsrlni.h.w | 4 | 2 | 4 | 1 |
xvsrlni.w.d | 4 | 2 | 4 | 1 |
xvsrlr.b | 3 | 2 | 3 | 2 |
xvsrlr.d | 3 | 2 | 3 | 2 |
xvsrlr.h | 3 | 2 | 3 | 2 |
xvsrlr.w | 3 | 2 | 3 | 2 |
xvsrlri.b | 3 | 2 | 3 | 2 |
xvsrlri.d | 3 | 2 | 3 | 2 |
xvsrlri.h | 3 | 2 | 3 | 2 |
xvsrlri.w | 3 | 2 | 3 | 2 |
xvsrlrn.b.h | 4 | 2 | 4 | 1 |
xvsrlrn.h.w | 4 | 2 | 4 | 1 |
xvsrlrn.w.d | 4 | 2 | 4 | 1 |
xvsrlrni.b.h | 4 | 2 | 4 | 1 |
xvsrlrni.d.q | 3 | 2 | 3 | 2 |
xvsrlrni.h.w | 4 | 2 | 4 | 1 |
xvsrlrni.w.d | 4 | 2 | 4 | 1 |
xvssran.b.h | 4 | 2 | 4 | 1 |
xvssran.bu.h | 4 | 2 | 4 | 1 |
xvssran.h.w | 4 | 2 | 4 | 1 |
xvssran.hu.w | 4 | 2 | 4 | 1 |
xvssran.w.d | 4 | 2 | 4 | 1 |
xvssran.wu.d | 4 | 2 | 4 | 1 |
xvssrani.b.h | 4 | 2 | 4 | 1 |
xvssrani.bu.h | 4 | 2 | 4 | 1 |
xvssrani.d.q | 3 | 2 | 3 | 2 |
xvssrani.du.q | 3 | 2 | 3 | 2 |
xvssrani.h.w | 4 | 2 | 4 | 1 |
xvssrani.hu.w | 4 | 2 | 4 | 1 |
xvssrani.w.d | 4 | 2 | 4 | 1 |
xvssrani.wu.d | 4 | 2 | 4 | 1 |
xvssrarn.b.h | 4 | 2 | 4 | 1 |
xvssrarn.bu.h | 4 | 2 | 4 | 1 |
xvssrarn.h.w | 4 | 2 | 4 | 1 |
xvssrarn.hu.w | 4 | 2 | 4 | 1 |
xvssrarn.w.d | 4 | 2 | 4 | 1 |
xvssrarn.wu.d | 4 | 2 | 4 | 1 |
xvssrarni.b.h | 4 | 2 | 4 | 1 |
xvssrarni.bu.h | 4 | 2 | 4 | 1 |
xvssrarni.d.q | 3 | 2 | 3 | 2 |
xvssrarni.du.q | 3 | 2 | 3 | 2 |
xvssrarni.h.w | 4 | 2 | 4 | 1 |
xvssrarni.hu.w | 4 | 2 | 4 | 1 |
xvssrarni.w.d | 4 | 2 | 4 | 1 |
xvssrarni.wu.d | 4 | 2 | 4 | 1 |
xvssrln.b.h | 4 | 2 | 4 | 1 |
xvssrln.bu.h | 4 | 2 | 4 | 1 |
xvssrln.h.w | 4 | 2 | 4 | 1 |
xvssrln.hu.w | 4 | 2 | 4 | 1 |
xvssrln.w.d | 4 | 2 | 4 | 1 |
xvssrln.wu.d | 4 | 2 | 4 | 1 |
xvssrlni.b.h | 4 | 2 | 4 | 1 |
xvssrlni.bu.h | 4 | 2 | 4 | 1 |
xvssrlni.d.q | 3 | 2 | 3 | 2 |
xvssrlni.du.q | 3 | 2 | 3 | 2 |
xvssrlni.h.w | 4 | 2 | 4 | 1 |
xvssrlni.hu.w | 4 | 2 | 4 | 1 |
xvssrlni.w.d | 4 | 2 | 4 | 1 |
xvssrlni.wu.d | 4 | 2 | 4 | 1 |
xvssrlrn.b.h | 4 | 2 | 4 | 1 |
xvssrlrn.bu.h | 4 | 2 | 4 | 1 |
xvssrlrn.h.w | 4 | 2 | 4 | 1 |
xvssrlrn.hu.w | 4 | 2 | 4 | 1 |
xvssrlrn.w.d | 4 | 2 | 4 | 1 |
xvssrlrn.wu.d | 4 | 2 | 4 | 1 |
xvssrlrni.b.h | 4 | 2 | 4 | 1 |
xvssrlrni.bu.h | 4 | 2 | 4 | 1 |
xvssrlrni.d.q | 3 | 2 | 3 | 2 |
xvssrlrni.du.q | 3 | 2 | 3 | 2 |
xvssrlrni.h.w | 4 | 2 | 4 | 1 |
xvssrlrni.hu.w | 4 | 2 | 4 | 1 |
xvssrlrni.w.d | 4 | 2 | 4 | 1 |
xvssrlrni.wu.d | 4 | 2 | 4 | 1 |
xvssub.b | 1 | 4 | 1 | 2 |
xvssub.bu | 1 | 4 | 1 | 2 |
xvssub.d | 1 | 4 | 1 | 2 |
xvssub.du | 1 | 4 | 1 | 2 |
xvssub.h | 1 | 4 | 1 | 2 |
xvssub.hu | 1 | 4 | 1 | 2 |
xvssub.w | 1 | 4 | 1 | 2 |
xvssub.wu | 1 | 4 | 1 | 2 |
xvsub.b | 1 | 4 | 1 | 2 |
xvsub.d | 1 | 4 | 1 | 2 |
xvsub.h | 1 | 4 | 1 | 2 |
xvsub.q | 3 | 2 | 3 | 2 |
xvsub.w | 1 | 4 | 1 | 2 |
xvsubi.bu | 1 | 4 | 1 | 2 |
xvsubi.du | 1 | 4 | 1 | 2 |
xvsubi.hu | 1 | 4 | 1 | 2 |
xvsubi.wu | 1 | 4 | 1 | 2 |
xvsubwev.d.w | 2 | 2 | 2 | 2 |
xvsubwev.d.wu | 2 | 2 | 2 | 2 |
xvsubwev.h.b | 2 | 2 | 2 | 2 |
xvsubwev.h.bu | 2 | 2 | 2 | 2 |
xvsubwev.q.d | 3 | 2 | 3 | 2 |
xvsubwev.q.du | 3 | 2 | 3 | 2 |
xvsubwev.w.h | 2 | 2 | 2 | 2 |
xvsubwev.w.hu | 2 | 2 | 2 | 2 |
xvsubwod.d.w | 2 | 2 | 2 | 2 |
xvsubwod.d.wu | 2 | 2 | 2 | 2 |
xvsubwod.h.b | 2 | 2 | 2 | 2 |
xvsubwod.h.bu | 2 | 2 | 2 | 2 |
xvsubwod.q.d | 3 | 2 | 3 | 2 |
xvsubwod.q.du | 3 | 2 | 3 | 2 |
xvsubwod.w.h | 2 | 2 | 2 | 2 |
xvsubwod.w.hu | 2 | 2 | 2 | 2 |
xvxor.v | 1 | 4 | 1 | 2 |
xvxori.b | 1 | 4 | 1 | 2 |
| CPU    | Latency | Throughput (CPI) |
|--------|---------|------------------|
| 3A6000 | 1       | 2                |
| 3C5000 | 1       | 2                |
```
__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
```

`#include <lsxintrin.h>`

Instruction: `vbitseli.b vr, vr, imm`

CPU Flags: LSX

Compute bitwise selection: for each bit position, if the bit in `a` equals one, copy the bit from `imm` to `dst`; otherwise copy it from `b`.
```
__m128i __lsx_vbitseli_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, 0x12)
= 0xba8b9aabba8b9a23 0x1216123012031221
```

```
for (int i = 0; i < 16; i++) {
    dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
}
```
Tested on real machine.

| CPU    | Latency | Throughput (CPI) |
|--------|---------|------------------|
| 3A6000 | 1       | 2                |
| 3C5000 | 1       | 2                |
```
__m128i __lsx_vbitclr_b (__m128i a, __m128i b)
```

`#include <lsxintrin.h>`

Instruction: `vbitclr.b vr, vr, vr`

CPU Flags: LSX

Clear the bit, specified by the corresponding element in `b`, in each 8-bit element of `a`, and save the result in `dst`.
```
__m128i __lsx_vbitclr_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700
```

```
for (int i = 0; i < 16; i++) {
    dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
}
```
Tested on real machine.

| CPU    | Latency | Throughput (CPI) |
|--------|---------|------------------|
| 3A6000 | 2       | 2                |
| 3C5000 | 2       | 2                |
__m128i __lsx_vbitclr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.h vr, vr, vr
+CPU Flags: LSX
+
Clear the bit, specified by the corresponding element in `b`, in each 16-bit element of `a`, and save the result in `dst`.
__m128i __lsx_vbitclr_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xf7fff7fff7fff7ff 0x99aabbccddecff00
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitclr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.w vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitclr_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xfffff7fffffff7ff 0x99aabbccddeeff00
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitclr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.d vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitclr_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xfffff7ffffffffff 0x99aabbccddeeff00
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.b vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitclri_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.h vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitclri_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffdfffdfffdfffd 0x99a8bbccddecff00
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.w vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitclri_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffffffdfffffffd 0x99aabbccddeeff00
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.d vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitclri_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffffffffffffffd 0x99aabbccddeeff00
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitset_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.b vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitset_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0808080808080808 0x9dbabfdcddeeff02
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitset_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.h vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitset_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0800080008000800 0x99babbdcddeeff02
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitset_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.w vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitset_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0000080000000800 0x99babbccddeeff02
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitset_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.d vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitset_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0000080000000000 0x99aabbceddeeff00
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.b vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitseti_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0202020202020202 0x9baabbcedfeeff02
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.h vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitseti_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0002000200020002 0x99aabbceddeeff02
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.w vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitseti_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0000000200000002 0x99aabbceddeeff02
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.d vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitseti_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0000000000000002 0x99aabbccddeeff02
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitrev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.b vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitrev_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0707070707070707 0x9dbabfdcd5ecf702
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitrev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.h vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitrev_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x070f070f070f070f 0x99babbdcddecff02
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitrev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.w vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitrev_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0f0f070f0f0f070f 0x99babbccddeeff02
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitrev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.d vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitrev_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.b vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitrevi_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.h vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitrevi_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.w vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitrevi_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.d vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
__m128i __lsx_vbitrevi_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vclo_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.b vr, vr
+CPU Flags: LSX
+
+Count leading ones of 8-bit elements in a
.
__m128i __lsx_vclo_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000001 0x0101010202030800
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clo(a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vclo_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.h vr, vr
+CPU Flags: LSX
+
+Count leading ones of 16-bit elements in a
.
__m128i __lsx_vclo_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0001000100020008
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = clo(a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vclo_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.w vr, vr
+CPU Flags: LSX
+
+Count leading ones of 32-bit elements in a
.
__m128i __lsx_vclo_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0000000100000002
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = clo(a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vclo_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.d vr, vr
+CPU Flags: LSX
+
+Count leading ones of 64-bit elements in a
.
__m128i __lsx_vclo_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0000000000000001
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clo(a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vclz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.b vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 8-bit elements in a
.
__m128i __lsx_vclz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0302020101010100 0x0000000000000008
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clz(a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vclz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.h vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 16-bit elements in a
.
__m128i __lsx_vclz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0003000200010001 0x0000000000000000
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = clz(a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vclz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.w vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 32-bit elements in a
.
__m128i __lsx_vclz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000300000001 0x0000000000000000
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = clz(a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vclz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.d vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 64-bit elements in a
.
__m128i __lsx_vclz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000003 0x0000000000000000
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clz(a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vpcnt_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.b vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 8-bit elements in a
.
__m128i __lsx_vpcnt_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0202040204040602 0x0404060406060800
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = popcount(a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vpcnt_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.h vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 16-bit elements in a
.
__m128i __lsx_vpcnt_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0004000600080008 0x0008000a000c0008
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = popcount(a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vpcnt_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.w vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 32-bit elements in a
.
__m128i __lsx_vpcnt_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000a00000010 0x0000001200000014
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = popcount(a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vpcnt_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.d vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 64-bit elements in a
.
__m128i __lsx_vpcnt_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000001a 0x0000000000000026
+
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = popcount(a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
int __lsx_bz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vseteqz.v fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if the whole vector a
equals zero.
dst = a.qword[0] == 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bnz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetnez.v fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if the whole vector a
is non-zero.
dst = a.qword[0] != 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.b fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 8-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 16; i++) {
+ if (a.byte[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.h fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 16-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 8; i++) {
+ if (a.half[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.w fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 32-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 4; i++) {
+ if (a.word[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.d fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 64-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 2; i++) {
+ if (a.dword[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bnz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.b fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 8-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 16; i++) {
+ if (a.byte[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bnz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.h fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 16-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 8; i++) {
+ if (a.half[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bnz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.w fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 32-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 4; i++) {
+ if (a.word[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
int __lsx_bnz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.d fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 64-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 2; i++) {
+ if (a.dword[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +2 | +
3C5000 | +N/A | +2 | +
__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128 __lsx_vfadd_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfadd.s vr, vr, vr
+CPU Flags: LSX
+
+Add single precision floating point elements in a
to elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +5 | +2 | +
__m128d __lsx_vfadd_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add double precision floating point elements in a
to elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +5 | +2 | +
__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfdiv.s vr, vr, vr
+CPU Flags: LSX
+
+Divide single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18(1/5.5) | +
3C5000 | +11, 19.5 | +0.13(1/7.5) | +
__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfdiv.d vr, vr, vr
+CPU Flags: LSX
+
+Divide double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 21.5 | +0.25(1/4) | +
3C5000 | +8, 16.5 | +0.08(1/12.5) | +
__m128 __lsx_vfmax_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmax.s vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of single precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128d __lsx_vfmax_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmax.d vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of double precision floating point elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.s vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.d vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128 __lsx_vfmin_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmin.s vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of single precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128d __lsx_vfmin_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmin.d vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of double precision floating point elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128 __lsx_vfmina_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmina.s vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128d __lsx_vfmina_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmina.d vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128 __lsx_vfmul_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmul.s vr, vr, vr
+CPU Flags: LSX
+
+Multiply single precision floating point elements in a
and elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128d __lsx_vfmul_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmul.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply double precision floating point elements in a
and elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128 __lsx_vfsub_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfsub.s vr, vr, vr
+CPU Flags: LSX
+
+Subtract single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +5 | +2 | +
__m128d __lsx_vfsub_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfsub.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
3C5000 | +5 | +2 | +
__m128 __lsx_vflogb_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vflogb.s vr, vr
+CPU Flags: LSX
+
+Compute 2-based logarithm of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = log2(a.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128d __lsx_vflogb_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vflogb.d vr, vr
+CPU Flags: LSX
+
+Compute 2-based logarithm of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = log2(a.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128 __lsx_vfsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.s vr, vr
+CPU Flags: LSX
+
+Compute square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.08(1/12) | +
3C5000 | +27 | +0.17(1/6) | +
__m128d __lsx_vfsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.d vr, vr
+CPU Flags: LSX
+
+Compute square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +36 | +0.06(1/17.5) | +
3C5000 | +36 | +0.05(1/18.5) | +
__m128 __lsx_vfrsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.s vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17 | +0.05(1/19) | +
3C5000 | +21 | +0.11(1/9) | +
__m128d __lsx_vfrsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.d vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +15 | +0.04(1/26.5) | +
3C5000 | +15 | +0.04(1/27.5) | +
__m128 __lsx_vfrecip_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecip.s vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1 / a.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18(1/5.5) | +
3C5000 | +27 | +0.14(1/7) | +
__m128d __lsx_vfrecip_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecip.d vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1 / a.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25(1/4) | +
3C5000 | +23 | +0.08(1/12) | +
__m128 __lsx_vfrsqrte_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.s vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+__m128d __lsx_vfrsqrte_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.d vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+__m128 __lsx_vfrecipe_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.s vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+__m128d __lsx_vfrecipe_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.d vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+
+ __m128d __lsx_vfcvth_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvth.d.s vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in higher half of a
to double precision.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp32[2 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m128d __lsx_vfcvtl_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.d.s vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in lower half of a
to double precision.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp32[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcvt.s.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double precision floating point elements in a
and b
to single precision.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ dst.fp32[i] = b.fp64[i];
+ } else {
+ dst.fp32[i] = a.fp64[i - 2];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m128 __lsx_vfcvth_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvth.s.h vr, vr
+CPU Flags: LSX
+
+Convert half precision floating point elements in higher half of a
to single precision.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp16[4 + i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m128 __lsx_vfcvtl_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.s.h vr, vr
+CPU Flags: LSX
+
+Convert half precision floating point elements in lower half of a
to single precision.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp16[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcvt.h.s vr, vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in a
and b
to half precision.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ dst.fp16[i] = b.fp32[i];
+ } else {
+ dst.fp16[i] = a.fp32[i - 4];
+ }
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +1 | +
__m128d __lsx_vffinth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffinth.d.w vr, vr
+CPU Flags: LSX
+
+Convert 32-bit integer elements in higher part of a
to double precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128d __lsx_vffintl_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffintl.d.w vr, vr
+CPU Flags: LSX
+
+Convert 32-bit integer elements in lower part of a
to double precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128d __lsx_vffint_d_l (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.l vr, vr
+CPU Flags: LSX
+
+Convert signed 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128d __lsx_vffint_d_lu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.lu vr, vr
+CPU Flags: LSX
+
+Convert unsigned 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128 __lsx_vffint_s_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.w vr, vr
+CPU Flags: LSX
+
+Convert signed 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128 __lsx_vffint_s_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.wu vr, vr
+CPU Flags: LSX
+
+Convert unsigned 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128 __lsx_vffint_s_l (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vffint.s.l vr, vr, vr
+CPU Flags: LSX
+
+Convert 64-bit integer elements in a
and b
to single-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] =
+ (i < 2) ? (f32)(s64)b.dword[i]
+ : (f32)(s64)a.dword[i - 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftinth_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftinth.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrml_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrml.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrmh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrmh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrpl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrpl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrph_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrph.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrzl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrzh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrnel_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrnel.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrneh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrneh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftint_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftint_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrm_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrm.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrm_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrp_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrp.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrp_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrz_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrz_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrne_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrne.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrne_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftint_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.lu.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftint_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.wu.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrz_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.lu.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftintrz_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.wu.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vftint_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftint.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+      ? (s32)b.fp64[i]
+      : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+      ? (s32)b.fp64[i]
+      : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+      ? (s32)b.fp64[i]
+      : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+      ? (s32)b.fp64[i]
+      : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+      ? (s32)b.fp64[i]
+      : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +1 | +
__m128i __lsx_vfclass_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfclass.d vr, vr
+CPU Flags: LSX
+
+Classify each double precision floating point element in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfclass_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfclass.s vr, vr
+CPU Flags: LSX
+
+Classify each single precision floating point element in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128 __lsx_vfrint_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrint.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, using current rounding mode specified in fcsr
, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128d __lsx_vfrint_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrint.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, using current rounding mode specified in fcsr
, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128 __lsx_vfrintrp_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128d __lsx_vfrintrp_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128 __lsx_vfrintrm_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128d __lsx_vfrintrm_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128 __lsx_vfrintrz_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128d __lsx_vfrintrz_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128 __lsx_vfrintrne_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128d __lsx_vfrintrne_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
3C5000 | +5 | +2 | +
__m128i __lsx_vseq_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the 8-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vseq_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the 16-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vseq_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the 32-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vseq_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the 64-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the 8-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the 16-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the 32-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the 64-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslt_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslt_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.bu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslt_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslt_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.hu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslt_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslt_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.wu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslt_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vslt_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.du vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.bu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.hu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.wu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.du vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsle_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsle_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.bu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsle_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsle_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.hu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsle_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsle_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.wu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsle_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsle_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.du vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.bu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.hu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.wu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal imm
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.du vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal imm
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.b vr, vr, vr
+CPU Flags: LSX
+
+Add 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.h vr, vr, vr
+CPU Flags: LSX
+
+Add 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + b.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.w vr, vr, vr
+CPU Flags: LSX
+
+Add 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + b.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vadd_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.q vr, vr, vr
+CPU Flags: LSX
+
+Add 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] + b.qword[0];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vabsd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.b vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vabsd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vabsd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.h vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vabsd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vabsd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.w vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vabsd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vabsd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vabsd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.du vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vadda_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.b vr, vr, vr
+CPU Flags: LSX
+
+Add the absolute values of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vadda_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.h vr, vr, vr
+CPU Flags: LSX
+
+Add the absolute values of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vadda_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.w vr, vr, vr
+CPU Flags: LSX
+
+Add the absolute values of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vadda_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.d vr, vr, vr
+CPU Flags: LSX
+
+Add the absolute values of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.bu vr, vr, imm
+CPU Flags: LSX
+
+Add 8-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.hu vr, vr, imm
+CPU Flags: LSX
+
+Add 16-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.wu vr, vr, imm
+CPU Flags: LSX
+
+Add 32-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.du vr, vr, imm
+CPU Flags: LSX
+
+Add 64-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vavg_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavg_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavg_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavg_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavg_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavg_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavg_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vavg_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vavgr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavgr_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavgr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavgr_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavgr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavgr_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vavgr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vavgr_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vdiv_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.b vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 32 | +0.06(1/15.5) | +
3C5000 | +29, 32 | +0.06(1/17) | +
__m128i __lsx_vdiv_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.bu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 33 | +0.06(1/16.5) | +
3C5000 | +29, 36 | +0.06(1/18) | +
__m128i __lsx_vdiv_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.h vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17 | +0.12(1/8.5) | +
3C5000 | +17, 21.5 | +0.09(1/11) | +
__m128i __lsx_vdiv_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.hu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 22 | +0.11(1/9) | +
3C5000 | +17, 21.5 | +0.07(1/14) | +
__m128i __lsx_vdiv_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.w vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18(1/5.5) | +
3C5000 | +11, 17.5 | +0.09(1/11.5) | +
__m128i __lsx_vdiv_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.wu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18(1/5.5) | +
3C5000 | +11, 17.5 | +0.07(1/15) | +
__m128i __lsx_vdiv_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.d vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25(1/4) | +
3C5000 | +8, 18.5 | +0.11(1/9) | +
__m128i __lsx_vdiv_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.du vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25(1/4) | +
3C5000 | +8, 18.5 | +0.11(1/9) | +
__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 8-bit elements in a
to even-positioned signed 8-bit elements in b
to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
to even-positioned unsigned 8-bit elements in b
to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 16-bit elements in a
to even-positioned signed 16-bit elements in b
to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
to even-positioned unsigned 16-bit elements in b
to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 32-bit elements in a
to even-positioned signed 32-bit elements in b
to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
to even-positioned unsigned 32-bit elements in b
to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 64-bit elements in a
to even-positioned signed 64-bit elements in b
to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
to even-positioned unsigned 64-bit elements in b
to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 8-bit elements in a
by even-positioned signed 8-bit elements in b
to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 8-bit elements in a
by even-positioned unsigned 8-bit elements in b
to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 16-bit elements in a
by even-positioned signed 16-bit elements in b
to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 16-bit elements in a
by even-positioned unsigned 16-bit elements in b
to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 32-bit elements in a
by even-positioned signed 32-bit elements in b
to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 32-bit elements in a
by even-positioned unsigned 32-bit elements in b
to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 64-bit elements in a
by even-positioned signed 64-bit elements in b
to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 64-bit elements in a
by even-positioned unsigned 64-bit elements in b
to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+ (u32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+ (u64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+ (u128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
3C5000 | +7 | +1.14 | +
__m128i __lsx_vmax_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.b vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmax_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmax_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.h vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmax_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmax_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.w vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmax_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmax_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.d vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmax_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.du vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.bu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.h vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.hu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.w vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.wu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.d vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.du vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmin_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.b vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmin_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmin_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.h vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmin_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmin_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.w vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmin_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmin_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.d vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmin_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.du vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.b vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.bu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.h vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.hu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.w vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.wu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.d vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.du vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder (modulo) of signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 35 | +0.06(1/15.5) | +
3C5000 | +29, 33 | +0.06(1/17) | +
__m128i __lsx_vmod_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the modulo (remainder) of unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 37 | +0.06(1/17.5) | +
3C5000 | +29, 33 | +0.05(1/19) | +
__m128i __lsx_vmod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the modulo (remainder) of signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 21 | +0.12(1/8.5) | +
3C5000 | +17, 21 | +0.09(1/11) | +
__m128i __lsx_vmod_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the modulo (remainder) of unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 21 | +0.11(1/9.5) | +
3C5000 | +17, 21 | +0.07(1/15) | +
__m128i __lsx_vmod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the modulo (remainder) of signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11, 13 | +0.18(1/5.5) | +
3C5000 | +11, 15 | +0.08(1/12) | +
__m128i __lsx_vmod_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the modulo (remainder) of unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11, 13 | +0.18(1/5.5) | +
3C5000 | +11, 15 | +0.06(1/16) | +
__m128i __lsx_vmod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the modulo (remainder) of signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 10 | +0.25(1/4) | +
3C5000 | +8, 10 | +0.11(1/9.5) | +
__m128i __lsx_vmod_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the modulo (remainder) of unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 10 | +0.25(1/4) | +
3C5000 | +8, 10 | +0.11(1/9.5) | +
__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmuh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmuh_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmuh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmuh_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmuh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmuh_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmuh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmuh_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmul_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmul_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] * b.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmul_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] * b.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmul_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +2 | +
__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
3C5000 | +7 | +2 | +
__m128i __lsx_vneg_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.b vr, vr
+CPU Flags: LSX
+
+Negate 8-bit elements in a
and save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = -a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vneg_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.h vr, vr
+CPU Flags: LSX
+
+Negate 16-bit elements in a
and save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = -a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vneg_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.w vr, vr
+CPU Flags: LSX
+
+Negate 32-bit elements in a
and save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = -a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vneg_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.d vr, vr
+CPU Flags: LSX
+
+Negate 64-bit elements in a
and save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = -a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.b vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsadd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.bu vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.h vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsadd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.hu vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.w vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsadd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.wu vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.d vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsadd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.du vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vssub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.b vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vssub_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.bu vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vssub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.h vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vssub_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.hu vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vssub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.w vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vssub_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.wu vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vssub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.d vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vssub_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.du vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] - b.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] - b.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsub_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.q vr, vr, vr
+CPU Flags: LSX
+
+Subtract 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] - b.qword[0];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.bu vr, vr, imm
+CPU Flags: LSX
+
+Subtract 8-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] - imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.hu vr, vr, imm
+CPU Flags: LSX
+
+Subtract 16-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] - imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.wu vr, vr, imm
+CPU Flags: LSX
+
+Subtract the immediate imm from each 32-bit element in a, save the result in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] - imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.du vr, vr, imm
+CPU Flags: LSX
+
+Subtract the immediate imm from each 64-bit element in a, save the result in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] - imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 8-bit elements in b from the corresponding even-positioned signed 8-bit elements in a, save the 16-bit results in dst.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 8-bit elements in b from the corresponding even-positioned unsigned 8-bit elements in a, save the 16-bit results in dst.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 16-bit elements in b from the corresponding even-positioned signed 16-bit elements in a, save the 32-bit results in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 16-bit elements in b from the corresponding even-positioned unsigned 16-bit elements in a, save the 32-bit results in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 32-bit elements in b from the corresponding even-positioned signed 32-bit elements in a, save the 64-bit results in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 32-bit elements in b from the corresponding even-positioned unsigned 32-bit elements in a, save the 64-bit results in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract the even-positioned signed 64-bit element in b from the even-positioned signed 64-bit element in a, save the 128-bit result in dst.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract the even-positioned unsigned 64-bit element in b from the even-positioned unsigned 64-bit element in a, save the 128-bit result in dst.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 8-bit elements in b from the corresponding odd-positioned signed 8-bit elements in a, save the 16-bit results in dst.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 8-bit elements in b from the corresponding odd-positioned unsigned 8-bit elements in a, save the 16-bit results in dst.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 16-bit elements in b from the corresponding odd-positioned signed 16-bit elements in a, save the 32-bit results in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 16-bit elements in b from the corresponding odd-positioned unsigned 16-bit elements in a, save the 32-bit results in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 32-bit elements in b from the corresponding odd-positioned signed 32-bit elements in a, save the 64-bit results in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 32-bit elements in b from the corresponding odd-positioned unsigned 32-bit elements in a, save the 64-bit results in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract the odd-positioned signed 64-bit element in b from the odd-positioned signed 64-bit element in a, save the 128-bit result in dst.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract the odd-positioned unsigned 64-bit element in b from the odd-positioned unsigned 64-bit element in a, save the 128-bit result in dst.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vand_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vand.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vandi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vandn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vandn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise AND-NOT between elements in a and b: dst = b AND (NOT a). Note that it is the first operand, a, that is complemented.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vnor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vnor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise NOR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vnori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise NOR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise OR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise OR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vorn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vorn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise OR-NOT between elements in a and b: dst = a OR (NOT b). Note that it is the second operand, b, that is complemented.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vxor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vxor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise XOR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vxori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise XOR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vld vr, r, imm
+CPU Flags: LSX
+
+Read whole vector from memory address addr + offset
, save the data into dst
. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldx (void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vldx vr, r, r
+CPU Flags: LSX
+
+Read whole vector from memory address addr + offset
, save the data into dst
. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.b vr, r, imm
+CPU Flags: LSX
+
+Read 8-bit data from memory address addr + (offset << 0)
, replicate the data to all vector lanes and save into dst
.
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = data;
+}
+
+__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.h vr, r, imm
+CPU Flags: LSX
+
+Read 16-bit data from memory address addr + (offset << 1)
, replicate the data to all vector lanes and save into dst
.
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = data;
+}
+
+__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.w vr, r, imm
+CPU Flags: LSX
+
+Read 32-bit data from memory address addr + (offset << 2)
, replicate the data to all vector lanes and save into dst
.
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = data;
+}
+
+__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.d vr, r, imm
+CPU Flags: LSX
+
+Read 64-bit data from memory address addr + (offset << 3)
, replicate the data to all vector lanes and save into dst
.
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = data;
+}
+
+void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vst vr, r, imm
+CPU Flags: LSX
+
+Write whole vector data in data
to memory address addr + offset
.
memory_store(128, data, addr + offset);
+
+void __lsx_vstx (__m128i data, void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vstx vr, r, r
+CPU Flags: LSX
+
+Write whole vector data in data
to memory address addr + offset
.
memory_store(128, data, addr + offset);
+
+void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.b vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 8-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(8, data.byte[lane], addr + offset);
+
+void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.h vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 16-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(16, data.half[lane], addr + offset);
+
+void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.w vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 32-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(32, data.word[lane], addr + offset);
+
+void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.d vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 64-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(64, data.dword[lane], addr + offset);
+
+
+ __m128i __lsx_vexth_h_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.h.b vr, vr
+CPU Flags: LSX
+
+Extend signed 8-bit elements in the higher half of a
to 16-bit.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vexth_hu_bu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.hu.bu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 8-bit elements in the higher half of a
to 16-bit.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vexth_w_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.w.h vr, vr
+CPU Flags: LSX
+
+Extend signed 16-bit elements in the higher half of a
to 32-bit.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vexth_wu_hu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.wu.hu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 16-bit elements in the higher half of a
to 32-bit.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vexth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.d.w vr, vr
+CPU Flags: LSX
+
+Extend signed 32-bit elements in the higher half of a
to 64-bit.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vexth_du_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.du.wu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 32-bit elements in the higher half of a
to 64-bit.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vexth_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.q.d vr, vr
+CPU Flags: LSX
+
+Extend signed 64-bit elements in the higher half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vexth_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.qu.du vr, vr
+CPU Flags: LSX
+
+Extend unsigned 64-bit elements in the higher half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vextl_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.q.d vr, vr
+CPU Flags: LSX
+
+Extend signed 64-bit elements in the lower half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vextl_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.qu.du vr, vr
+CPU Flags: LSX
+
+Extend unsigned 64-bit elements in the lower half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.b vr, vr, imm
+CPU Flags: LSX
+
+Extract the 8-bit element of b indexed by the low 4 bits of imm, and insert it into the lane of a indexed by the high 4 bits of imm; all other lanes are copied from a.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.h vr, vr, imm
+CPU Flags: LSX
+
+Extract the 16-bit element of b indexed by the low 4 bits of imm (taken modulo 8), and insert it into the lane of a indexed by the high 4 bits of imm (modulo 8); all other lanes are copied from a.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.w vr, vr, imm
+CPU Flags: LSX
+
+Extract the 32-bit element of b indexed by the low 4 bits of imm (taken modulo 4), and insert it into the lane of a indexed by the high 4 bits of imm (modulo 4); all other lanes are copied from a.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.d vr, vr, imm
+CPU Flags: LSX
+
+Extract the 64-bit element of b indexed by the low 4 bits of imm (taken modulo 2), and insert it into the lane of a indexed by the high 4 bits of imm (modulo 2); all other lanes are copied from a.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vilvh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.b vr, vr, vr
+CPU Flags: LSX
+
+Interleave 8-bit elements in higher half of a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vilvh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.h vr, vr, vr
+CPU Flags: LSX
+
+Interleave 16-bit elements in higher half of a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vilvh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.w vr, vr, vr
+CPU Flags: LSX
+
+Interleave 32-bit elements in higher half of a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vilvh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.d vr, vr, vr
+CPU Flags: LSX
+
+Interleave 64-bit elements in higher half of a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vilvl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.b vr, vr, vr
+CPU Flags: LSX
+
+Interleave 8-bit elements in lower half of a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vilvl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.h vr, vr, vr
+CPU Flags: LSX
+
+Interleave 16-bit elements in lower half of a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vilvl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.w vr, vr, vr
+CPU Flags: LSX
+
+Interleave 32-bit elements in lower half of a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vilvl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.d vr, vr, vr
+CPU Flags: LSX
+
+Interleave 64-bit elements in lower half of a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.b vr, r, imm
+CPU Flags: LSX
+
+Insert the 8-bit element from the general-purpose register b into the lane of a indexed by imm; all other lanes are copied from a.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i == imm) ? b : a.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.h vr, r, imm
+CPU Flags: LSX
+
+Insert the 16-bit element from the general-purpose register b into the lane of a indexed by imm; all other lanes are copied from a.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i == imm) ? b : a.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.w vr, r, imm
+CPU Flags: LSX
+
+Insert the 32-bit element from the general-purpose register b into the lane of a indexed by imm; all other lanes are copied from a.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.d vr, r, imm
+CPU Flags: LSX
+
+Insert the 64-bit element from the general-purpose register b into the lane of a indexed by imm; all other lanes are copied from a.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.b vr, vr, vr
+CPU Flags: LSX
+
+Find the first negative 8-bit element in b, and store the index of that element into the lane of a selected by the lowest byte of c taken modulo 16; all other lanes are copied from a. If no element is negative, the element count 16 is stored instead.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[c.byte[0] % 16] = i;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.h vr, vr, vr
+CPU Flags: LSX
+
+Find the first negative 16-bit element in b, and store the index of that element into the lane of a selected by the lowest 16-bit element of c taken modulo 8; all other lanes are copied from a. If no element is negative, the element count 8 is stored instead.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[c.half[0] % 8] = i;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.b vr, vr, imm
+CPU Flags: LSX
+
+Find the first negative 8-bit element in b, and store the index of that element into the lane of a selected by imm taken modulo 16; all other lanes are copied from a. If no element is negative, the element count 16 is stored instead.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[imm % 16] = i;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.h vr, vr, imm
+CPU Flags: LSX
+
+Find the first negative 16-bit element in b, and store the index of that element into the lane of a selected by imm taken modulo 8; all other lanes are copied from a. If no element is negative, the element count 8 is stored instead.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[imm % 8] = i;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vmskgez_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskgez.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is greater than or equal to zero, set one bit in dst
, otherwise clear it.
__m128i __lsx_vmskgez_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x00000000000001fe 0x0000000000000000
+__m128i __lsx_vmskgez_b(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x000000000000b7cf 0x0000000000000000
+
+u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmskltz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
__m128i __lsx_vmskltz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000fe01 0x0000000000000000
+__m128i __lsx_vmskltz_b(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000004830 0x0000000000000000
+
+u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmskltz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.h vr, vr
+CPU Flags: LSX
+
+For each 16-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
__m128i __lsx_vmskltz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x00000000000000f0 0x0000000000000000
+__m128i __lsx_vmskltz_h(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000024 0x0000000000000000
+
+u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmskltz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.w vr, vr
+CPU Flags: LSX
+
+For each 32-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
__m128i __lsx_vmskltz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000000c 0x0000000000000000
+__m128i __lsx_vmskltz_w(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000004 0x0000000000000000
+
+u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmskltz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.d vr, vr
+CPU Flags: LSX
+
+For each 64-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
__m128i __lsx_vmskltz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000002 0x0000000000000000
+__m128i __lsx_vmskltz_d(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000000 0x0000000000000000
+
+u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vmsknz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmsknz.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is non-zero, set one bit in dst
, otherwise clear it.
__m128i __lsx_vmsknz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000feff 0x0000000000000000
+__m128i __lsx_vmsknz_b(__m128i{0x0000111100000000, 0x0011000011111111})
+= 0x0000000000004f30 0x0000000000000000
+
+u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpackev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.b vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 8-bit elements in a
and b
and store dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpackev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.h vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 16-bit elements in a
and b
and store dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpackev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.w vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 32-bit elements in a
and b
and store dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpackev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.d vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 64-bit elements in a
and b
and store dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpackod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.b vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 8-bit elements in a
and b
and store dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpackod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.h vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 16-bit elements in a
and b
and store dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpackod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.w vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 32-bit elements in a
and b
and store dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpackod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.d vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 64-bit elements in a
and b
and store dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpickev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.b vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 8-bit elements in b
first, then pick even-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpickev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.h vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 16-bit elements in b
first, then pick even-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpickev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.w vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 32-bit elements in b
first, then pick even-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpickev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.d vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 64-bit elements in b
first, then pick even-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.b r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s8)a.byte[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.bu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u8)a.byte[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.h r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s16)a.half[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.hu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u16)a.half[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.w r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s32)a.word[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.wu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u32)a.word[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.d r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s64)a.dword[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.du r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u64)a.dword[idx];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vpickod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.b vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 8-bit elements in b
first, then pick odd-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpickod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.h vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 16-bit elements in b
first, then pick odd-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpickod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.w vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 32-bit elements in b
first, then pick odd-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vpickod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.d vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 64-bit elements in b
first, then pick odd-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vrepli_b (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_h (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_w (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_d (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vreplgr2vr_b (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.b vr, r
+CPU Flags: LSX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = val;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +1 | +
3C5000 | +N/A | +1 | +
__m128i __lsx_vreplgr2vr_h (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.h vr, r
+CPU Flags: LSX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = val;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +1 | +
3C5000 | +N/A | +1 | +
__m128i __lsx_vreplgr2vr_w (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.w vr, r
+CPU Flags: LSX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = val;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +1 | +
3C5000 | +N/A | +1 | +
__m128i __lsx_vreplgr2vr_d (long int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.d vr, r
+CPU Flags: LSX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = val;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +N/A | +1 | +
3C5000 | +N/A | +1 | +
__m128i __lsx_vreplve_b (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.b vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx % 16];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vreplve_h (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.h vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx % 8];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vreplve_w (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.w vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx % 4];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vreplve_d (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.d vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx % 2];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
3C5000 | +1 | +1 | +
__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.b vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.h vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.w vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.d vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.b vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.bu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.h vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.hu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.w vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clamp<s32>(a.word[i], -((s64)1 << imm), ((s64)1 << imm) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.wu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clamp<u32>(a.word[i], 0, ((u64)1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.d vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clamp<s64>(a.dword[i], -((s128)1 << imm), ((s128)1 << imm) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.du vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clamp<u64>(a.dword[i], 0, ((u128)1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vsigncov_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.b vr, vr, vr
+CPU Flags: LSX
+
+If the 8-bit element in a
equals to zero, set the result to zero. If the signed 8-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result. If a
and b
are the same vectors, it is equivalent to computing absolute value.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsigncov_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.h vr, vr, vr
+CPU Flags: LSX
+
+If the 16-bit element in a
equals to zero, set the result to zero. If the signed 16-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result. If a
and b
are the same vectors, it is equivalent to computing absolute value.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsigncov_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.w vr, vr, vr
+CPU Flags: LSX
+
+If the 32-bit element in a
equals to zero, set the result to zero. If the signed 32-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result. If a
and b
are the same vectors, it is equivalent to computing absolute value.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsigncov_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.d vr, vr, vr
+CPU Flags: LSX
+
+If the 64-bit element in a
equals to zero, set the result to zero. If the signed 64-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result. If a
and b
are the same vectors, it is equivalent to computing absolute value.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vldi (imm_n1024_1023 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Initialize dst
using predefined patterns:
imm[12:10]=0b000
: broadcast imm[7:0]
as 8-bit elements to all lanesimm[12:10]=0b001
: broadcast sign-extended imm[9:0]
as 16-bit elements to all lanesimm[12:10]=0b010
: broadcast sign-extended imm[9:0]
as 32-bit elements to all lanesimm[12:10]=0b011
: broadcast sign-extended imm[9:0]
as 64-bit elements to all lanesimm[12:8]=0b10000
: broadcast imm[7:0]
as 32-bit elements to all lanesimm[12:8]=0b10001
: broadcast imm[7:0] << 8
as 32-bit elements to all lanesimm[12:8]=0b10010
: broadcast imm[7:0] << 16
as 32-bit elements to all lanesimm[12:8]=0b10011
: broadcast imm[7:0] << 24
as 32-bit elements to all lanesimm[12:8]=0b10100
: broadcast imm[7:0]
as 16-bit elements to all lanesimm[12:8]=0b10101
: broadcast imm[7:0] << 8
as 16-bit elements to all lanesimm[12:8]=0b10110
: broadcast (imm[7:0] << 8) | 0xFF
as 32-bit elements to all lanesimm[12:8]=0b10111
: broadcast (imm[7:0] << 16) | 0xFFFF
as 32-bit elements to all lanesimm[12:8]=0b11000
: broadcast imm[7:0]
as 8-bit elements to all lanesimm[12:8]=0b11001
: repeat each bit of imm[7:0]
eight times, and broadcast the result as 64-bit elements to all lanesimm[12:8]=0b11010
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 32-bit elements to all lanesimm[12:8]=0b11011
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 64-bit elements to all lanesimm[12:8]=0b11100
: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)
as 64-bit elements to all lanesu64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+ broadcast_value = simm9_0;
+ broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+ broadcast_value = simm9_0;
+ broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+ broadcast_value = simm9_0;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+ broadcast_value = imm7_0 << 16;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+ broadcast_value = imm7_0 << 24;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+ broadcast_value = imm7_0;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+ broadcast_value = (imm7_0 << 8) | 0xFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+ broadcast_value = (imm7_0 << 16) | 0xFFFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+ broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+ imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+ imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+ imm7 * 0xFF00000000000000;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+ broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+ (imm5_0 << 48);
+ broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+ for (int i = 0; i < 16; i++) {
+ dst.byte[i] = broadcast_value;
+ }
+} else if (broadcast_width == 16) {
+ for (int i = 0; i < 8; i++) {
+ dst.half[i] = broadcast_value;
+ }
+} else if (broadcast_width == 32) {
+ for (int i = 0; i < 4; i++) {
+ dst.word[i] = broadcast_value;
+ }
+} else if (broadcast_width == 64) {
+ for (int i = 0; i < 2; i++) {
+ dst.dword[i] = broadcast_value;
+ }
+}
+
+Tested on real machine.
+ +__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vpermi.w vr, vr, imm
+CPU Flags: LSX
+
+Permute words from a
and b
with indices recorded in imm
and store into dst
.
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsll.v vr, vr, imm
+CPU Flags: LSX
+
+Compute whole vector a
shifted left by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsrl.v vr, vr, imm
+CPU Flags: LSX
+
+Compute whole vector a
shifted right by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsll_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.b vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 8-bit elements in `a` by the elements in `b`, and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsll_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.h vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsll_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.w vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsll_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.d vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vslli.b vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vslli.h vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslli.w vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vslli.d vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.h.b vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift the signed 8-bit elements in `a` by `imm` to a signed 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.hu.bu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 8-bit elements in a
by imm
to unsigned 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.w.h vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift signed 16-bit elements in a
by imm
to signed 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.wu.hu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 16-bit elements in a
by imm
to unsigned 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.d.w vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift signed 32-bit elements in a
by imm
to signed 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.du.wu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 32-bit elements in a
by imm
to unsigned 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsra_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.b vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsra_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsra_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsra_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.b vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in `a` by the elements in `b`, truncate to 8-bit, and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+ : (s32)((s64)a.dword[i - 2] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+ : (s64)((s128)a.qword[i - 1] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrar_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.b vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+ (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrar_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+ (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrar_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+ (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrar_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.b vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] =
+ ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] =
+ ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] =
+ ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+ (((s16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (s16)(s32)a.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+ (((s32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+ (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+ (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)b.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+ (((s32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+ (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)b.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+ (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)b.qword[i];
+ } else {
+ dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+ (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.b vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.b vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] >> imm;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
3C5000 | +2 | +1 | +
__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+ : (u32)((u64)a.dword[i - 2] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+ : (u64)((u128)a.qword[i - 1] >> imm);
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.b vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+ ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+ ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+ ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+ ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.b vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+ (((u16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (u16)(u32)a.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+ (((u32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+ (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+ (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)b.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+ (((u32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+ (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)b.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+ (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)b.qword[i];
+ } else {
+ dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+ (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vssran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.du.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+| CPU    | Latency | Throughput (CPI) |
+| ------ | ------- | ---------------- |
+| 3A6000 | 4       | 2                |
+| 3C5000 | 4       | 1                |
__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in `a` by the corresponding elements in `b`, clamp the result to fit in a signed 8-bit integer, and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
3C5000 | +4 | +1 | +
__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
3C5000 | +3 | +2 | +
__m128i __lsx_vrotr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.b vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vrotr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.h vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+ (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vrotr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.w vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+ (a.word[i] << (32 - (b.word[i] & 0x1f)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vrotr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.d vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+ (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.b vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.h vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.w vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.d vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +2 | +2 | +
__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.b vr, vr, vr, vr
+CPU Flags: LSX
+
+Shuffle bytes from a
and b
with indices from c
.
Caveat: the indices are placed in c
, while in other vshuf
intrinsics, they are placed in a
.
__m128i __lsx_vshuf_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, __m128i{0x0011021304050607, 0x0811120213031404})
+= 0x7877155513efcdab 0x2177661555144413
+
+for (int i = 0; i < 16; i++) {
+ if (c.byte[i] >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.byte[i] = 0;
+ } else if ((c.byte[i] % 32) < 16) {
+ dst.byte[i] = b.byte[c.byte[i] % 16];
+ } else {
+ dst.byte[i] = a.byte[c.byte[i] % 16];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.h vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 16-bit elements in b
and c
with indices from a
, save the result to dst
.
__m128i __lsx_vshuf_h(__m128i{0x0001000200030004, 0x0005000a000b000c}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x1415ef13abcd4321 0x432133441122ff00
+
+for (int i = 0; i < 8; i++) {
+ if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.half[i] = 0;
+ } else if ((a.half[i] % 16) < 8) {
+ dst.half[i] = c.half[a.half[i] % 8];
+ } else {
+ dst.half[i] = b.half[a.half[i] % 8];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.w vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 32-bit elements in b
and c
with indices from a
, save the result to dst
.
__m128i __lsx_vshuf_w(__m128i{0x0000000200000004, 0x0000000700000005}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x4321432155667788 0x99aabbcc11223344
+
+for (int i = 0; i < 4; i++) {
+ if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.word[i] = 0;
+ } else if ((a.word[i] % 8) < 4) {
+ dst.word[i] = c.word[a.word[i] % 4];
+ } else {
+ dst.word[i] = b.word[a.word[i] % 4];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.d vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 64-bit elements in b
and c
with indices from a
, save the result to dst
.
__m128i __lsx_vshuf_d(__m128i{0x0000000000000001, 0x0000000000000002}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x1234123443214321 0x1122334455667788
+
+for (int i = 0; i < 2; i++) {
+ if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.dword[i] = 0;
+ } else if ((a.dword[i] % 4) < 2) {
+ dst.dword[i] = c.dword[a.dword[i] % 2];
+ } else {
+ dst.dword[i] = b.dword[a.dword[i] % 2];
+ }
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.b vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 8-bit elements in a
with indices packed in imm
, save the result to dst
.
__m128i __lsx_vshuf4i_b(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x13ef13cd78667815 0x3412343421432121
+
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.h vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 16-bit elements in a
with indices packed in imm
, save the result to dst
.
__m128i __lsx_vshuf4i_h(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x667814156678ef13 0x4321432143211234
+
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.w vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 32-bit elements in a
with indices packed in imm
, save the result to dst
.
__m128i __lsx_vshuf4i_w(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x1415667843214321 0x14156678abcdef13
+
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.d vr, vr, imm
+CPU Flags: LSX
+
+Shuffle 64-bit elements picked from the four candidates in a
+and b
with indices packed in imm
, save the result to dst
.
__m128i __lsx_vshuf4i_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0xabcdef1314156678 0x1122334455667788
+
+dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+ (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+
+Tested on real machine.
+CPU | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
3C5000 | +1 | +2 | +
AVX is a 256-bit SIMD extension to x86. It is possible to migrate existing AVX code to leverage the LoongArch LASX extension by rewriting the intrinsics or instructions manually, or by using tools like SIMD Everywhere to implement AVX intrinsics with their LASX counterparts. But to unleash the full performance, you may want to port your code to LASX manually.
+Thankfully, LASX intrinsics adopt the same types as AVX: you can use the following familiar types for SIMD:
+Here is a table mapping AVX intrinsics to their LASX counterparts (WIP):
+AVX | +LASX | +
---|---|
_mm256_abs_epi16 | +__lasx_xvsigncov_h | +
_mm256_abs_epi32 | +__lasx_xvsigncov_w | +
_mm256_abs_epi8 | +__lasx_xvsigncov_b | +
_mm256_add_epi16 | +__lasx_xvadd_h | +
_mm256_add_epi32 | +__lasx_xvadd_w | +
_mm256_add_epi64 | +__lasx_xvadd_d | +
_mm256_add_epi8 | +__lasx_xvadd_b | +
_mm256_add_pd | +__lasx_xvfadd_d | +
_mm256_add_ps | +__lasx_xvfadd_s | +
_mm256_adds_epi16 | +__lasx_xvsadd_h | +
_mm256_adds_epi8 | +__lasx_xvsadd_b | +
_mm256_adds_epu16 | +__lasx_xvsadd_hu | +
_mm256_adds_epu8 | +__lasx_xvsadd_bu | +
_mm256_addsub_pd | ++ |
_mm256_addsub_ps | ++ |
_mm256_alignr_epi8 | ++ |
_mm256_and_pd | +__lasx_xvand_v | +
_mm256_and_ps | +__lasx_xvand_v | +
_mm256_and_si256 | +__lasx_xvand_v | +
_mm256_andnot_pd | +__lasx_xvandn_v | +
_mm256_andnot_ps | +__lasx_xvandn_v | +
_mm256_andnot_si256 | +__lasx_xvandn_v | +
_mm256_avg_epu16 | +__lasx_xvavgr_hu | +
_mm256_avg_epu8 | +__lasx_xvavgr_bu | +
_mm256_bcstnebf16_ps | ++ |
_mm256_bcstnesh_ps | ++ |
_mm256_blend_epi16 | ++ |
_mm256_blend_epi32 | ++ |
_mm256_blend_pd | ++ |
_mm256_blend_ps | ++ |
_mm256_blendv_epi8 | ++ |
_mm256_blendv_pd | ++ |
_mm256_blendv_ps | ++ |
_mm256_broadcast_pd | ++ |
_mm256_broadcast_ps | ++ |
_mm256_broadcast_sd | ++ |
_mm256_broadcast_ss | ++ |
_mm256_broadcastb_epi8 | ++ |
_mm256_broadcastd_epi32 | ++ |
_mm256_broadcastq_epi64 | ++ |
_mm256_broadcastsd_pd | ++ |
_mm256_broadcastsi128_si256 | ++ |
_mm256_broadcastss_ps | ++ |
_mm256_broadcastw_epi16 | ++ |
_mm256_bslli_epi128 | ++ |
_mm256_bsrli_epi128 | ++ |
_mm256_castpd128_pd256 | ++ |
_mm256_castpd256_pd128 | ++ |
_mm256_castpd_ps | ++ |
_mm256_castpd_si256 | ++ |
_mm256_castps128_ps256 | ++ |
_mm256_castps256_ps128 | ++ |
_mm256_castps_pd | ++ |
_mm256_castps_si256 | ++ |
_mm256_castsi128_si256 | ++ |
_mm256_castsi256_pd | ++ |
_mm256_castsi256_ps | ++ |
_mm256_castsi256_si128 | ++ |
_mm256_ceil_pd | ++ |
_mm256_ceil_ps | ++ |
_mm256_cmp_pd | ++ |
_mm256_cmp_ps | ++ |
_mm256_cmpeq_epi16 | ++ |
_mm256_cmpeq_epi32 | ++ |
_mm256_cmpeq_epi64 | ++ |
_mm256_cmpeq_epi8 | ++ |
_mm256_cmpgt_epi16 | ++ |
_mm256_cmpgt_epi32 | ++ |
_mm256_cmpgt_epi64 | ++ |
_mm256_cmpgt_epi8 | ++ |
_mm256_cvtepi16_epi32 | ++ |
_mm256_cvtepi16_epi64 | ++ |
_mm256_cvtepi32_epi64 | ++ |
_mm256_cvtepi32_pd | ++ |
_mm256_cvtepi32_ps | ++ |
_mm256_cvtepi8_epi16 | ++ |
_mm256_cvtepi8_epi32 | ++ |
_mm256_cvtepi8_epi64 | ++ |
_mm256_cvtepu16_epi32 | ++ |
_mm256_cvtepu16_epi64 | ++ |
_mm256_cvtepu32_epi64 | ++ |
_mm256_cvtepu8_epi16 | ++ |
_mm256_cvtepu8_epi32 | ++ |
_mm256_cvtepu8_epi64 | ++ |
_mm256_cvtneebf16_ps | ++ |
_mm256_cvtneeph_ps | ++ |
_mm256_cvtneobf16_ps | ++ |
_mm256_cvtneoph_ps | ++ |
_mm256_cvtneps_avx_pbh | ++ |
_mm256_cvtneps_pbh | ++ |
_mm256_cvtpd_epi32 | ++ |
_mm256_cvtpd_ps | ++ |
_mm256_cvtph_ps | ++ |
_mm256_cvtps_epi32 | ++ |
_mm256_cvtps_pd | ++ |
_mm256_cvtps_ph | ++ |
_mm256_cvtsd_f64 | ++ |
_mm256_cvtsi256_si32 | ++ |
_mm256_cvtss_f32 | ++ |
_mm256_cvttpd_epi32 | ++ |
_mm256_cvttps_epi32 | ++ |
_mm256_div_pd | +__lasx_xvfdiv_d | +
_mm256_div_ps | +__lasx_xvfdiv_s | +
_mm256_dp_ps | ++ |
_mm256_dpbssd_epi32 | ++ |
_mm256_dpbssds_epi32 | ++ |
_mm256_dpbsud_epi32 | ++ |
_mm256_dpbsuds_epi32 | ++ |
_mm256_dpbusd_avx_epi32 | ++ |
_mm256_dpbusd_epi32 | ++ |
_mm256_dpbusds_avx_epi32 | ++ |
_mm256_dpbusds_epi32 | ++ |
_mm256_dpbuud_epi32 | ++ |
_mm256_dpbuuds_epi32 | ++ |
_mm256_dpwssd_avx_epi32 | ++ |
_mm256_dpwssd_epi32 | ++ |
_mm256_dpwssds_avx_epi32 | ++ |
_mm256_dpwssds_epi32 | ++ |
_mm256_dpwsud_epi32 | ++ |
_mm256_dpwsuds_epi32 | ++ |
_mm256_dpwusd_epi32 | ++ |
_mm256_dpwusds_epi32 | ++ |
_mm256_dpwuud_epi32 | ++ |
_mm256_dpwuuds_epi32 | ++ |
_mm256_extract_epi16 | ++ |
_mm256_extract_epi32 | ++ |
_mm256_extract_epi64 | ++ |
_mm256_extract_epi8 | ++ |
_mm256_extractf128_pd | ++ |
_mm256_extractf128_ps | ++ |
_mm256_extractf128_si256 | ++ |
_mm256_extracti128_si256 | ++ |
_mm256_floor_pd | ++ |
_mm256_floor_ps | ++ |
_mm256_fmadd_pd | ++ |
_mm256_fmadd_ps | ++ |
_mm256_fmaddsub_pd | ++ |
_mm256_fmaddsub_ps | ++ |
_mm256_fmsub_pd | ++ |
_mm256_fmsub_ps | ++ |
_mm256_fmsubadd_pd | ++ |
_mm256_fmsubadd_ps | ++ |
_mm256_fnmadd_pd | ++ |
_mm256_fnmadd_ps | ++ |
_mm256_fnmsub_pd | ++ |
_mm256_fnmsub_ps | ++ |
_mm256_hadd_epi16 | ++ |
_mm256_hadd_epi32 | ++ |
_mm256_hadd_pd | ++ |
_mm256_hadd_ps | ++ |
_mm256_hadds_epi16 | ++ |
_mm256_hsub_epi16 | ++ |
_mm256_hsub_epi32 | ++ |
_mm256_hsub_pd | ++ |
_mm256_hsub_ps | ++ |
_mm256_hsubs_epi16 | ++ |
_mm256_i32gather_epi32 | ++ |
_mm256_i32gather_epi64 | ++ |
_mm256_i32gather_pd | ++ |
_mm256_i32gather_ps | ++ |
_mm256_i64gather_epi32 | ++ |
_mm256_i64gather_epi64 | ++ |
_mm256_i64gather_pd | ++ |
_mm256_i64gather_ps | ++ |
_mm256_insert_epi16 | ++ |
_mm256_insert_epi32 | ++ |
_mm256_insert_epi64 | ++ |
_mm256_insert_epi8 | ++ |
_mm256_insertf128_pd | ++ |
_mm256_insertf128_ps | ++ |
_mm256_insertf128_si256 | ++ |
_mm256_inserti128_si256 | ++ |
_mm256_lddqu_si256 | ++ |
_mm256_load_pd | ++ |
_mm256_load_ps | ++ |
_mm256_load_si256 | ++ |
_mm256_loadu2_m128d | ++ |
_mm256_loadu2_m128i | ++ |
_mm256_loadu2_m128 | ++ |
_mm256_loadu_pd | ++ |
_mm256_loadu_ps | ++ |
_mm256_loadu_si256 | ++ |
_mm256_madd52hi_avx_epu64 | ++ |
_mm256_madd52hi_epu64 | ++ |
_mm256_madd52lo_avx_epu64 | ++ |
_mm256_madd52lo_epu64 | ++ |
_mm256_madd_epi16 | ++ |
_mm256_maddubs_epi16 | ++ |
_mm256_mask_i32gather_epi32 | ++ |
_mm256_mask_i32gather_epi64 | ++ |
_mm256_mask_i32gather_pd | ++ |
_mm256_mask_i32gather_ps | ++ |
_mm256_mask_i64gather_epi32 | ++ |
_mm256_mask_i64gather_epi64 | ++ |
_mm256_mask_i64gather_pd | ++ |
_mm256_mask_i64gather_ps | ++ |
_mm256_maskload_epi32 | ++ |
_mm256_maskload_epi64 | ++ |
_mm256_maskload_pd | ++ |
_mm256_maskload_ps | ++ |
_mm256_maskstore_epi32 | ++ |
_mm256_maskstore_epi64 | ++ |
_mm256_maskstore_pd | ++ |
_mm256_maskstore_ps | ++ |
_mm256_max_epi16 | +__lasx_xvmax_h | +
_mm256_max_epi32 | +__lasx_xvmax_w | +
_mm256_max_epi8 | +__lasx_xvmax_b | +
_mm256_max_epu16 | +__lasx_xvmax_hu | +
_mm256_max_epu32 | +__lasx_xvmax_wu | +
_mm256_max_epu8 | +__lasx_xvmax_bu | +
_mm256_max_pd | +__lasx_xvfmax_d | +
_mm256_max_ps | +__lasx_xvfmax_s | +
_mm256_min_epi16 | +__lasx_xvmin_h | +
_mm256_min_epi32 | +__lasx_xvmin_w | +
_mm256_min_epi8 | +__lasx_xvmin_b | +
_mm256_min_epu16 | +__lasx_xvmin_hu | +
_mm256_min_epu32 | +__lasx_xvmin_wu | +
_mm256_min_epu8 | +__lasx_xvmin_bu | +
_mm256_min_pd | +__lasx_xvfmin_d | +
_mm256_min_ps | +__lasx_xvfmin_s | +
_mm256_movedup_pd | ++ |
_mm256_movehdup_ps | ++ |
_mm256_moveldup_ps | ++ |
_mm256_movemask_epi8 | ++ |
_mm256_movemask_pd | ++ |
_mm256_movemask_ps | ++ |
_mm256_mpsadbw_epu8 | ++ |
_mm256_mul_epi32 | ++ |
_mm256_mul_epu32 | ++ |
_mm256_mul_pd | +__lasx_xvfmul_d | +
_mm256_mul_ps | +__lasx_xvfmul_s | +
_mm256_mulhi_epi16 | ++ |
_mm256_mulhi_epu16 | ++ |
_mm256_mulhrs_epi16 | ++ |
_mm256_mullo_epi16 | ++ |
_mm256_mullo_epi32 | ++ |
_mm256_or_pd | ++ |
_mm256_or_ps | ++ |
_mm256_or_si256 | +__lasx_xvor_v | +
_mm256_packs_epi16 | ++ |
_mm256_packs_epi32 | ++ |
_mm256_packus_epi16 | ++ |
_mm256_packus_epi32 | ++ |
_mm256_permute2f128_pd | ++ |
_mm256_permute2f128_ps | ++ |
_mm256_permute2f128_si256 | ++ |
_mm256_permute2x128_si256 | ++ |
_mm256_permute4x64_epi64 | ++ |
_mm256_permute4x64_pd | ++ |
_mm256_permute_pd | ++ |
_mm256_permute_ps | ++ |
_mm256_permutevar8x32_epi32 | ++ |
_mm256_permutevar8x32_ps | ++ |
_mm256_permutevar_pd | ++ |
_mm256_permutevar_ps | ++ |
_mm256_rcp_ps | ++ |
_mm256_round_pd | ++ |
_mm256_round_ps | ++ |
_mm256_rsqrt_ps | ++ |
_mm256_sad_epu8 | ++ |
_mm256_set1_epi16 | ++ |
_mm256_set1_epi32 | ++ |
_mm256_set1_epi64x | ++ |
_mm256_set1_epi8 | ++ |
_mm256_set1_pd | ++ |
_mm256_set1_ps | ++ |
_mm256_set_epi16 | ++ |
_mm256_set_epi32 | ++ |
_mm256_set_epi64x | ++ |
_mm256_set_epi8 | ++ |
_mm256_set_m128d | ++ |
_mm256_set_m128i | ++ |
_mm256_set_m128 | ++ |
_mm256_set_pd | ++ |
_mm256_set_ps | ++ |
_mm256_setr_epi16 | ++ |
_mm256_setr_epi32 | ++ |
_mm256_setr_epi64x | ++ |
_mm256_setr_epi8 | ++ |
_mm256_setr_m128d | ++ |
_mm256_setr_m128i | ++ |
_mm256_setr_m128 | ++ |
_mm256_setr_pd | ++ |
_mm256_setr_ps | ++ |
_mm256_setzero_pd | ++ |
_mm256_setzero_ps | ++ |
_mm256_setzero_si256 | ++ |
_mm256_sha512msg1_epi64 | ++ |
_mm256_sha512msg2_epi64 | ++ |
_mm256_sha512rnds2_epi64 | ++ |
_mm256_shuffle_epi32 | ++ |
_mm256_shuffle_epi8 | ++ |
_mm256_shuffle_pd | ++ |
_mm256_shuffle_ps | ++ |
_mm256_shufflehi_epi16 | ++ |
_mm256_shufflelo_epi16 | ++ |
_mm256_sign_epi16 | ++ |
_mm256_sign_epi32 | ++ |
_mm256_sign_epi8 | ++ |
_mm256_sll_epi16 | +__lasx_xvsll_h | +
_mm256_sll_epi32 | +__lasx_xvsll_w | +
_mm256_sll_epi64 | +__lasx_xvsll_d | +
_mm256_slli_epi16 | +__lasx_xvslli_h | +
_mm256_slli_epi32 | +__lasx_xvslli_w | +
_mm256_slli_epi64 | +__lasx_xvslli_d | +
_mm256_slli_si256 | ++ |
_mm256_sllv_epi32 | ++ |
_mm256_sllv_epi64 | ++ |
_mm256_sm4key4_epi32 | ++ |
_mm256_sm4rnds4_epi32 | ++ |
_mm256_sqrt_pd | ++ |
_mm256_sqrt_ps | ++ |
_mm256_sra_epi16 | +__lasx_xvsra_h | +
_mm256_sra_epi32 | +__lasx_xvsra_w | +
_mm256_srai_epi16 | +__lasx_xvsrai_h | +
_mm256_srai_epi32 | +__lasx_xvsrai_w | +
_mm256_srav_epi32 | ++ |
_mm256_srl_epi16 | +__lasx_xvsrl_h | +
_mm256_srl_epi32 | +__lasx_xvsrl_w | +
_mm256_srl_epi64 | +__lasx_xvsrl_d | +
_mm256_srli_epi16 | +__lasx_xvsrli_h | +
_mm256_srli_epi32 | +__lasx_xvsrli_w | +
_mm256_srli_epi64 | +__lasx_xvsrli_d | +
_mm256_srli_si256 | ++ |
_mm256_srlv_epi32 | ++ |
_mm256_srlv_epi64 | ++ |
_mm256_store_pd | ++ |
_mm256_store_ps | ++ |
_mm256_store_si256 | ++ |
_mm256_storeu2_m128d | ++ |
_mm256_storeu2_m128i | ++ |
_mm256_storeu2_m128 | ++ |
_mm256_storeu_pd | ++ |
_mm256_storeu_ps | ++ |
_mm256_storeu_si256 | ++ |
_mm256_stream_load_si256 | ++ |
_mm256_stream_pd | ++ |
_mm256_stream_ps | ++ |
_mm256_stream_si256 | ++ |
_mm256_sub_epi16 | +__lasx_xvsub_h | +
_mm256_sub_epi32 | +__lasx_xvsub_w | +
_mm256_sub_epi64 | +__lasx_xvsub_d | +
_mm256_sub_epi8 | +__lasx_xvsub_b | +
_mm256_sub_pd | +__lasx_xvfsub_d | +
_mm256_sub_ps | +__lasx_xvfsub_s | +
_mm256_subs_epi16 | ++ |
_mm256_subs_epi8 | ++ |
_mm256_subs_epu16 | ++ |
_mm256_subs_epu8 | ++ |
_mm256_testc_pd | ++ |
_mm256_testc_ps | ++ |
_mm256_testc_si256 | ++ |
_mm256_testnzc_pd | ++ |
_mm256_testnzc_ps | ++ |
_mm256_testnzc_si256 | ++ |
_mm256_testz_pd | ++ |
_mm256_testz_ps | ++ |
_mm256_testz_si256 | ++ |
_mm256_undefined_pd | ++ |
_mm256_undefined_ps | ++ |
_mm256_undefined_si256 | ++ |
_mm256_unpackhi_epi16 | +__lasx_xvilvh_h | +
_mm256_unpackhi_epi32 | +__lasx_xvilvh_w | +
_mm256_unpackhi_epi64 | +__lasx_xvilvh_d | +
_mm256_unpackhi_epi8 | +__lasx_xvilvh_b | +
_mm256_unpackhi_pd | ++ |
_mm256_unpackhi_ps | ++ |
_mm256_unpacklo_epi16 | +__lasx_xvilvl_h | +
_mm256_unpacklo_epi32 | +__lasx_xvilvl_w | +
_mm256_unpacklo_epi64 | +__lasx_xvilvl_d | +
_mm256_unpacklo_epi8 | +__lasx_xvilvl_b | +
_mm256_unpacklo_pd | ++ |
_mm256_unpacklo_ps | ++ |
_mm256_xor_pd | ++ |
_mm256_xor_ps | ++ |
_mm256_xor_si256 | ++ |
_mm256_zeroall | ++ |
_mm256_zeroupper | ++ |
_mm256_zextpd128_pd256 | ++ |
_mm256_zextps128_ps256 | ++ |
_mm256_zextsi128_si256 | ++ |
_mm_bcstnebf16_ps | ++ |
_mm_bcstnesh_ps | ++ |
_mm_blend_epi32 | ++ |
_mm_broadcast_ss | ++ |
_mm_broadcastb_epi8 | ++ |
_mm_broadcastd_epi32 | ++ |
_mm_broadcastq_epi64 | ++ |
_mm_broadcastsd_pd | ++ |
_mm_broadcastsi128_si256 | ++ |
_mm_broadcastss_ps | ++ |
_mm_broadcastw_epi16 | ++ |
_mm_cmp_pd | ++ |
_mm_cmp_ps | ++ |
_mm_cmp_sd | ++ |
_mm_cmp_ss | ++ |
_mm_cvtneebf16_ps | ++ |
_mm_cvtneeph_ps | ++ |
_mm_cvtneobf16_ps | ++ |
_mm_cvtneoph_ps | ++ |
_mm_cvtneps_avx_pbh | ++ |
_mm_cvtneps_pbh | ++ |
_mm_cvtph_ps | ++ |
_mm_cvtps_ph | ++ |
_mm_dpbssd_epi32 | ++ |
_mm_dpbssds_epi32 | ++ |
_mm_dpbsud_epi32 | ++ |
_mm_dpbsuds_epi32 | ++ |
_mm_dpbusd_avx_epi32 | ++ |
_mm_dpbusd_epi32 | ++ |
_mm_dpbusds_avx_epi32 | ++ |
_mm_dpbusds_epi32 | ++ |
_mm_dpbuud_epi32 | ++ |
_mm_dpbuuds_epi32 | ++ |
_mm_dpwssd_avx_epi32 | ++ |
_mm_dpwssd_epi32 | ++ |
_mm_dpwssds_avx_epi32 | ++ |
_mm_dpwssds_epi32 | ++ |
_mm_dpwsud_epi32 | ++ |
_mm_dpwsuds_epi32 | ++ |
_mm_dpwusd_epi32 | ++ |
_mm_dpwusds_epi32 | ++ |
_mm_dpwuud_epi32 | ++ |
_mm_dpwuuds_epi32 | ++ |
_mm_fmadd_pd | ++ |
_mm_fmadd_ps | ++ |
_mm_fmadd_sd | ++ |
_mm_fmadd_ss | ++ |
_mm_fmaddsub_pd | ++ |
_mm_fmaddsub_ps | ++ |
_mm_fmsub_pd | ++ |
_mm_fmsub_ps | ++ |
_mm_fmsub_sd | ++ |
_mm_fmsub_ss | ++ |
_mm_fmsubadd_pd | ++ |
_mm_fmsubadd_ps | ++ |
_mm_fnmadd_pd | ++ |
_mm_fnmadd_ps | ++ |
_mm_fnmadd_sd | ++ |
_mm_fnmadd_ss | ++ |
_mm_fnmsub_pd | ++ |
_mm_fnmsub_ps | ++ |
_mm_fnmsub_sd | ++ |
_mm_fnmsub_ss | ++ |
_mm_i32gather_epi32 | ++ |
_mm_i32gather_epi64 | ++ |
_mm_i32gather_pd | ++ |
_mm_i32gather_ps | ++ |
_mm_i64gather_epi32 | ++ |
_mm_i64gather_epi64 | ++ |
_mm_i64gather_pd | ++ |
_mm_i64gather_ps | ++ |
_mm_madd52hi_avx_epu64 | ++ |
_mm_madd52hi_epu64 | ++ |
_mm_madd52lo_avx_epu64 | ++ |
_mm_madd52lo_epu64 | ++ |
_mm_mask_i32gather_epi32 | ++ |
_mm_mask_i32gather_epi64 | ++ |
_mm_mask_i32gather_pd | ++ |
_mm_mask_i32gather_ps | ++ |
_mm_mask_i64gather_epi32 | ++ |
_mm_mask_i64gather_epi64 | ++ |
_mm_mask_i64gather_pd | ++ |
_mm_mask_i64gather_ps | ++ |
_mm_maskload_epi32 | ++ |
_mm_maskload_epi64 | ++ |
_mm_maskload_pd | ++ |
_mm_maskload_ps | ++ |
_mm_maskstore_epi32 | ++ |
_mm_maskstore_epi64 | ++ |
_mm_maskstore_pd | ++ |
_mm_maskstore_ps | ++ |
_mm_permute_pd | ++ |
_mm_permute_ps | ++ |
_mm_permutevar_pd | ++ |
_mm_permutevar_ps | ++ |
_mm_sllv_epi32 | ++ |
_mm_sllv_epi64 | ++ |
_mm_sm3msg1_epi32 | ++ |
_mm_sm3msg2_epi32 | ++ |
_mm_sm3rnds2_epi32 | ++ |
_mm_sm4key4_epi32 | ++ |
_mm_sm4rnds4_epi32 | ++ |
_mm_srav_epi32 | ++ |
_mm_srlv_epi32 | ++ |
_mm_srlv_epi64 | ++ |
_mm_testc_pd | ++ |
_mm_testc_ps | ++ |
_mm_testnzc_pd | ++ |
_mm_testnzc_ps | ++ |
_mm_testz_pd | ++ |
_mm_testz_ps | ++ |
The list of AVX intrinsics came from Intel Intrinsics Guide.
+ +SSE is a 128-bit SIMD extension to X86. It is possible to migrate existing SSE code to leverage LoongArch LSX extension by rewriting the intrinsics or instructions manually, or by using tools like SIMD Everywhere to implement SSE intrinsics with LSX counterparts. But to unleash the full performance, you may want to port your code to LSX manually.
+Thankfully, LSX intrinsics adopt the same type as SSE: you can use the following familiar types for SIMD:
+Here is a table of a mapping from SSE intrinsics to their LSX counterpart (WIP):
+SSE | +LSX | +
---|---|
_mm_abs_epi16 | +__lsx_vsigncov_h | +
_mm_abs_epi32 | +__lsx_vsigncov_w | +
_mm_abs_epi8 | +__lsx_vsigncov_b | +
_mm_add_epi16 | +__lsx_vadd_h | +
_mm_add_epi32 | +__lsx_vadd_w | +
_mm_add_epi64 | +__lsx_vadd_d | +
_mm_add_epi8 | +__lsx_vadd_b | +
_mm_add_pd | +__lsx_vfadd_d | +
_mm_add_ps | +__lsx_vfadd_s | +
_mm_add_sd | +__lsx_vfadd_d + __lsx_vextrins_d | +
_mm_add_ss | +__lsx_vfadd_s + __lsx_vextrins_w | +
_mm_adds_epi16 | +__lsx_vsadd_h | +
_mm_adds_epi8 | +__lsx_vsadd_b | +
_mm_adds_epu16 | +__lsx_vsadd_hu | +
_mm_adds_epu8 | +__lsx_vsadd_bu | +
_mm_addsub_pd | ++ |
_mm_addsub_ps | ++ |
_mm_alignr_epi8 | ++ |
_mm_and_pd | +__lsx_vand_v | +
_mm_and_ps | +__lsx_vand_v | +
_mm_and_si128 | +__lsx_vand_v | +
_mm_andnot_pd | +__lsx_vandn_v | +
_mm_andnot_ps | +__lsx_vandn_v | +
_mm_andnot_si128 | +__lsx_vandn_v | +
_mm_avg_epu16 | +__lsx_vavgr_hu | +
_mm_avg_epu8 | +__lsx_vavgr_bu | +
_mm_blend_epi16 | ++ |
_mm_blend_pd | ++ |
_mm_blend_ps | ++ |
_mm_blendv_epi8 | ++ |
_mm_blendv_pd | ++ |
_mm_blendv_ps | ++ |
_mm_bslli_si128 | +__lsx_vbsll_v | +
_mm_bsrli_si128 | +__lsx_vbsrl_v | +
_mm_castpd_ps | +type conversion | +
_mm_castpd_si128 | +type conversion | +
_mm_castps_pd | +type conversion | +
_mm_castps_si128 | +type conversion | +
_mm_castsi128_pd | +type conversion | +
_mm_castsi128_ps | +type conversion | +
_mm_ceil_pd | +__lsx_vfrintrp_d | +
_mm_ceil_ps | +__lsx_vfrintrp_s | +
_mm_ceil_sd | +__lsx_vfrintrp_d + __lsx_vextrins_d | +
_mm_ceil_ss | +__lsx_vfrintrp_s + __lsx_vextrins_w | +
_mm_cmpeq_epi16 | +__lsx_vseq_h | +
_mm_cmpeq_epi32 | +__lsx_vseq_w | +
_mm_cmpeq_epi64 | +__lsx_vseq_d | +
_mm_cmpeq_epi8 | +__lsx_vseq_b | +
_mm_cmpeq_pd | +__lsx_vfcmp_ceq_d | +
_mm_cmpeq_ps | +__lsx_vfcmp_ceq_s | +
_mm_cmpeq_sd | +__lsx_vfcmp_ceq_d + __lsx_vextrins_d | +
_mm_cmpeq_ss | +__lsx_vfcmp_ceq_s + __lsx_vextrins_w | +
_mm_cmpestra | ++ |
_mm_cmpestrc | ++ |
_mm_cmpestri | ++ |
_mm_cmpestrm | ++ |
_mm_cmpestro | ++ |
_mm_cmpestrs | ++ |
_mm_cmpestrz | ++ |
_mm_cmpge_pd | +__lsx_vfcmp_cle_d | +
_mm_cmpge_ps | +__lsx_vfcmp_cle_s | +
_mm_cmpge_sd | +__lsx_vfcmp_cle_d + __lsx_vextrins_d | +
_mm_cmpge_ss | +__lsx_vfcmp_cle_s + __lsx_vextrins_w | +
_mm_cmpgt_epi16 | +__lsx_vslt_h | +
_mm_cmpgt_epi32 | +__lsx_vslt_w | +
_mm_cmpgt_epi64 | +__lsx_vslt_d | +
_mm_cmpgt_epi8 | +__lsx_vslt_b | +
_mm_cmpgt_pd | +__lsx_vfcmp_clt_d | +
_mm_cmpgt_ps | +__lsx_vfcmp_clt_s | +
_mm_cmpgt_sd | +__lsx_vfcmp_clt_d + __lsx_vextrins_d | +
_mm_cmpgt_ss | +__lsx_vfcmp_clt_s + __lsx_vextrins_w | +
_mm_cmpistra | ++ |
_mm_cmpistrc | ++ |
_mm_cmpistri | ++ |
_mm_cmpistrm | ++ |
_mm_cmpistro | ++ |
_mm_cmpistrs | ++ |
_mm_cmpistrz | ++ |
_mm_cmple_pd | +__lsx_vfcmp_cle_d | +
_mm_cmple_ps | +__lsx_vfcmp_cle_s | +
_mm_cmple_sd | +__lsx_vfcmp_cle_d + __lsx_vextrins_d | +
_mm_cmple_ss | +__lsx_vfcmp_cle_s + __lsx_vextrins_w | +
_mm_cmplt_epi16 | +__lsx_vslt_h | +
_mm_cmplt_epi32 | +__lsx_vslt_w | +
_mm_cmplt_epi8 | +__lsx_vslt_b | +
_mm_cmplt_pd | +__lsx_vfcmp_clt_d | +
_mm_cmplt_ps | +__lsx_vfcmp_clt_s | +
_mm_cmplt_sd | +__lsx_vfcmp_clt_d + __lsx_vextrins_d | +
_mm_cmplt_ss | +__lsx_vfcmp_clt_s + __lsx_vextrins_w | +
_mm_cmpneq_pd | +__lsx_vfcmp_cune_d | +
_mm_cmpneq_ps | +__lsx_vfcmp_cune_s | +
_mm_cmpneq_sd | +__lsx_vfcmp_cune_d + __lsx_vextrins_d | +
_mm_cmpneq_ss | +__lsx_vfcmp_cune_s + __lsx_vextrins_w | +
_mm_cmpnge_pd | +__lsx_vfcmp_cult_d | +
_mm_cmpnge_ps | +__lsx_vfcmp_cult_s | +
_mm_cmpnge_sd | +__lsx_vfcmp_cult_d + __lsx_vextrins_d | +
_mm_cmpnge_ss | +__lsx_vfcmp_cult_s + __lsx_vextrins_w | +
_mm_cmpngt_pd | +__lsx_vfcmp_cule_d | +
_mm_cmpngt_ps | +__lsx_vfcmp_cule_s | +
_mm_cmpngt_sd | +__lsx_vfcmp_cule_d + __lsx_vextrins_d | +
_mm_cmpngt_ss | +__lsx_vfcmp_cule_s + __lsx_vextrins_w | +
_mm_cmpnle_pd | +__lsx_vfcmp_cult_d | +
_mm_cmpnle_ps | +__lsx_vfcmp_cult_s | +
_mm_cmpnle_sd | +__lsx_vfcmp_cult_d + __lsx_vextrins_d | +
_mm_cmpnle_ss | +__lsx_vfcmp_cult_s + __lsx_vextrins_w | +
_mm_cmpnlt_pd | +__lsx_vfcmp_cule_d | +
_mm_cmpnlt_ps | +__lsx_vfcmp_cule_s | +
_mm_cmpnlt_sd | +__lsx_vfcmp_cule_d + __lsx_vextrins_d | +
_mm_cmpnlt_ss | +__lsx_vfcmp_cule_s + __lsx_vextrins_w | +
_mm_cmpord_pd | +__lsx_vfcmp_cor_d | +
_mm_cmpord_ps | +__lsx_vfcmp_cor_s | +
_mm_cmpord_sd | +__lsx_vfcmp_cor_d + __lsx_vextrins_d | +
_mm_cmpord_ss | +__lsx_vfcmp_cor_s + __lsx_vextrins_w | +
_mm_cmpunord_pd | +__lsx_vfcmp_cun_d | +
_mm_cmpunord_ps | +__lsx_vfcmp_cun_s | +
_mm_cmpunord_sd | +__lsx_vfcmp_cun_d + __lsx_vextrins_d | +
_mm_cmpunord_ss | +__lsx_vfcmp_cun_s + __lsx_vextrins_w | +
_mm_comieq_sd | ++ |
_mm_comieq_ss | ++ |
_mm_comige_sd | ++ |
_mm_comige_ss | ++ |
_mm_comigt_sd | ++ |
_mm_comigt_ss | ++ |
_mm_comile_sd | ++ |
_mm_comile_ss | ++ |
_mm_comilt_sd | ++ |
_mm_comilt_ss | ++ |
_mm_comineq_sd | ++ |
_mm_comineq_ss | ++ |
_mm_cvt_pi2ps | ++ |
_mm_cvt_ps2pi | ++ |
_mm_cvt_si2ss | ++ |
_mm_cvt_ss2si | ++ |
_mm_cvtepi16_epi32 | +__lsx_vsllwil_w_h | +
_mm_cvtepi16_epi64 | ++ |
_mm_cvtepi32_epi64 | +__lsx_vsllwil_d_w | +
_mm_cvtepi32_pd | +__lsx_vffintl_d_w | +
_mm_cvtepi32_ps | +__lsx_vffint_s_w | +
_mm_cvtepi8_epi16 | +__lsx_vsllwil_h_b | +
_mm_cvtepi8_epi32 | ++ |
_mm_cvtepi8_epi64 | ++ |
_mm_cvtepu16_epi32 | +__lsx_vsllwil_wu_hu | +
_mm_cvtepu16_epi64 | ++ |
_mm_cvtepu32_epi64 | +__lsx_vsllwil_du_wu | +
_mm_cvtepu8_epi16 | +__lsx_vsllwil_hu_bu | +
_mm_cvtepu8_epi32 | ++ |
_mm_cvtepu8_epi64 | ++ |
_mm_cvtpd_epi32 | +__lsx_vftint_w_d | +
_mm_cvtpd_pi32 | ++ |
_mm_cvtpd_ps | +__lsx_vfcvt_s_d | +
_mm_cvtpi16_ps | ++ |
_mm_cvtpi32_pd | ++ |
_mm_cvtpi32_ps | ++ |
_mm_cvtpi32x2_ps | ++ |
_mm_cvtpi8_ps | ++ |
_mm_cvtps_epi32 | +__lsx_vftint_w_s | +
_mm_cvtps_pd | +__lsx_vfcvtl_d_s | +
_mm_cvtps_pi16 | ++ |
_mm_cvtps_pi32 | ++ |
_mm_cvtps_pi8 | ++ |
_mm_cvtpu16_ps | ++ |
_mm_cvtpu8_ps | ++ |
_mm_cvtsd_f64 | ++ |
_mm_cvtsd_si32 | ++ |
_mm_cvtsd_si64 | ++ |
_mm_cvtsd_si64x | ++ |
_mm_cvtsd_ss | ++ |
_mm_cvtsi128_si32 | +__lsx_vpickve2gr_w | +
_mm_cvtsi128_si64 | +__lsx_vpickve2gr_d | +
_mm_cvtsi128_si64x | +__lsx_vpickve2gr_d | +
_mm_cvtsi32_sd | ++ |
_mm_cvtsi32_si128 | ++ |
_mm_cvtsi32_ss | ++ |
_mm_cvtsi64_sd | ++ |
_mm_cvtsi64_si128 | ++ |
_mm_cvtsi64_ss | ++ |
_mm_cvtsi64x_sd | ++ |
_mm_cvtsi64x_si128 | ++ |
_mm_cvtss_f32 | ++ |
_mm_cvtss_sd | ++ |
_mm_cvtss_si32 | ++ |
_mm_cvtss_si64 | ++ |
_mm_cvtt_ps2pi | ++ |
_mm_cvtt_ss2si | ++ |
_mm_cvttpd_epi32 | +__lsx_vftintrz_w_d | +
_mm_cvttpd_pi32 | ++ |
_mm_cvttps_epi32 | +__lsx_vftintrz_w_s | +
_mm_cvttps_pi32 | ++ |
_mm_cvttsd_si32 | ++ |
_mm_cvttsd_si64 | ++ |
_mm_cvttsd_si64x | ++ |
_mm_cvttss_si32 | ++ |
_mm_cvttss_si64 | ++ |
_mm_div_pd | +__lsx_vfdiv_d | +
_mm_div_ps | +__lsx_vfdiv_s | +
_mm_div_sd | +__lsx_vfdiv_d + __lsx_vextrins_d | +
_mm_div_ss | +__lsx_vfdiv_s + __lsx_vextrins_w | +
_mm_dp_pd | ++ |
_mm_dp_ps | ++ |
_mm_extract_epi16 | +__lsx_vpickve2gr_h | +
_mm_extract_epi32 | +__lsx_vpickve2gr_w | +
_mm_extract_epi64 | +__lsx_vpickve2gr_d | +
_mm_extract_epi8 | +__lsx_vpickve2gr_b | +
_mm_extract_ps | +__lsx_vpickve2gr_w | +
_mm_floor_pd | +__lsx_vfrintrm_d | +
_mm_floor_ps | +__lsx_vfrintrm_s | +
_mm_floor_sd | +__lsx_vfrintrm_d + __lsx_vextrins_d | +
_mm_floor_ss | +__lsx_vfrintrm_s + __lsx_vextrins_w | +
_mm_hadd_epi16 | ++ |
_mm_hadd_epi32 | ++ |
_mm_hadd_pd | ++ |
_mm_hadd_ps | ++ |
_mm_hadds_epi16 | ++ |
_mm_hsub_epi16 | ++ |
_mm_hsub_epi32 | ++ |
_mm_hsub_pd | ++ |
_mm_hsub_ps | ++ |
_mm_hsubs_epi16 | ++ |
_mm_insert_epi16 | +__lsx_vinsgr2vr_h | +
_mm_insert_epi32 | +__lsx_vinsgr2vr_w | +
_mm_insert_epi64 | +__lsx_vinsgr2vr_d | +
_mm_insert_epi8 | +__lsx_vinsgr2vr_b | +
_mm_insert_ps | +__lsx_vinsgr2vr_w | +
_mm_lddqu_si128 | ++ |
_mm_load_pd | +__lsx_vld | +
_mm_load_pd1 | +__lsx_vldrepl_d | +
_mm_load_ps | +__lsx_vld | +
_mm_load_ps1 | +__lsx_vldrepl_w | +
_mm_load_sd | ++ |
_mm_load_si128 | ++ |
_mm_load_ss | ++ |
_mm_load1_pd | +__lsx_vldrepl_d | +
_mm_load1_ps | +__lsx_vldrepl_w | +
_mm_loaddup_pd | ++ |
_mm_loadh_pd | ++ |
_mm_loadh_pi | ++ |
_mm_loadl_epi64 | ++ |
_mm_loadl_pd | ++ |
_mm_loadl_pi | ++ |
_mm_loadr_pd | +__lsx_vld + __lsx_vshuf4i_w | +
_mm_loadr_ps | +__lsx_vld + __lsx_vshuf4i_w | +
_mm_loadu_pd | +__lsx_vld | +
_mm_loadu_ps | +__lsx_vld | +
_mm_loadu_si128 | +__lsx_vld | +
_mm_loadu_si16 | ++ |
_mm_loadu_si32 | ++ |
_mm_loadu_si64 | ++ |
_mm_madd_epi16 | ++ |
_mm_maddubs_epi16 | ++ |
_mm_maskmoveu_si128 | ++ |
_mm_max_epi16 | +__lsx_vmax_h | +
_mm_max_epi32 | +__lsx_vmax_w | +
_mm_max_epi8 | +__lsx_vmax_b | +
_mm_max_epu16 | +__lsx_vmax_hu | +
_mm_max_epu32 | +__lsx_vmax_wu | +
_mm_max_epu8 | +__lsx_vmax_bu | +
_mm_max_pd | +__lsx_vfmax_d | +
_mm_max_ps | +__lsx_vfmax_s | +
_mm_max_sd | +__lsx_vfmax_d + __lsx_vextrins_d | +
_mm_max_ss | +__lsx_vfmax_s + __lsx_vextrins_w | +
_mm_min_epi16 | +__lsx_vmin_h | +
_mm_min_epi32 | +__lsx_vmin_w | +
_mm_min_epi8 | +__lsx_vmin_b | +
_mm_min_epu16 | +__lsx_vmin_hu | +
_mm_min_epu32 | +__lsx_vmin_wu | +
_mm_min_epu8 | +__lsx_vmin_bu | +
_mm_min_pd | +__lsx_vfmin_d | +
_mm_min_ps | +__lsx_vfmin_s | +
_mm_min_sd | +__lsx_vfmin_d + __lsx_vextrins_d | +
_mm_min_ss | +__lsx_vfmin_s + __lsx_vextrins_w | +
_mm_minpos_epu16 | ++ |
_mm_move_epi64 | ++ |
_mm_move_sd | +__lsx_vextrins_d | +
_mm_move_ss | +__lsx_vextrins_w | +
_mm_movedup_pd | ++ |
_mm_movehdup_ps | ++ |
_mm_movehl_ps | +__lsx_vilvh_d | +
_mm_moveldup_ps | ++ |
_mm_movelh_ps | +__lsx_vilvl_d | +
_mm_movemask_epi8 | ++ |
_mm_movemask_pd | ++ |
_mm_movemask_ps | +__lsx_vmskltz_w + __lsx_vpickve2gr_wu | +
_mm_movepi64_pi64 | ++ |
_mm_movpi64_epi64 | ++ |
_mm_mpsadbw_epu8 | ++ |
_mm_mul_epi32 | ++ |
_mm_mul_epu32 | ++ |
_mm_mul_pd | +__lsx_vfmul_d | +
_mm_mul_ps | +__lsx_vfmul_s | +
_mm_mul_sd | ++ |
_mm_mul_ss | ++ |
_mm_mulhi_epi16 | ++ |
_mm_mulhi_epu16 | ++ |
_mm_mulhrs_epi16 | ++ |
_mm_mullo_epi16 | ++ |
_mm_mullo_epi32 | ++ |
_mm_or_pd | +__lsx_vor_v | +
_mm_or_ps | +__lsx_vor_v | +
_mm_or_si128 | +__lsx_vor_v | +
_mm_packs_epi16 | ++ |
_mm_packs_epi32 | ++ |
_mm_packus_epi16 | ++ |
_mm_packus_epi32 | ++ |
_mm_rcp_ps | +__lsx_vfrecip_s | +
_mm_rcp_ss | ++ |
_mm_round_pd | +__lsx_vfrintr*_d | +
_mm_round_ps | +__lsx_vfrintr*_s | +
_mm_round_sd | ++ |
_mm_round_ss | ++ |
_mm_rsqrt_ps | +__lsx_vfrsqrt_s | +
_mm_rsqrt_ss | ++ |
_mm_sad_epu8 | ++ |
_mm_set_epi16 | ++ |
_mm_set_epi32 | ++ |
_mm_set_epi64 | ++ |
_mm_set_epi64x | ++ |
_mm_set_epi8 | ++ |
_mm_set_pd | ++ |
_mm_set_pd1 | +__lsx_vldrepl_d/__lsx_vreplgr2vr_d | +
_mm_set_ps | ++ |
_mm_set_ps1 | +__lsx_vldrepl_w/__lsx_vreplgr2vr_w | +
_mm_set_sd | ++ |
_mm_set_ss | ++ |
_mm_set1_epi16 | +__lsx_vreplgr2vr_h | +
_mm_set1_epi32 | +__lsx_vreplgr2vr_w | +
_mm_set1_epi64 | +__lsx_vreplgr2vr_d | +
_mm_set1_epi64x | ++ |
_mm_set1_epi8 | +__lsx_vreplgr2vr_b | +
_mm_set1_pd | ++ |
_mm_set1_ps | ++ |
_mm_setr_epi16 | +use lsxintrin.h--v8i16 to reverse construction | +
_mm_setr_epi32 | +use lsxintrin.h--v4i32 to reverse construction | +
_mm_setr_epi64 | +use lsxintrin.h--v2i64 to reverse construction | +
_mm_setr_epi8 | +use lsxintrin.h--v16i8 to reverse construction | +
_mm_setr_pd | +use lsxintrin.h--v2f64 to reverse construction | +
_mm_setr_ps | +use lsxintrin.h--v4f32 to reverse construction | +
_mm_setzero_pd | +(__m128d)__lsx_vldi(0) | +
_mm_setzero_ps | +(__m128)__lsx_vldi(0) | +
_mm_setzero_si128 | +__lsx_vldi(0) | +
_mm_shuffle_epi32 | ++ |
_mm_shuffle_epi8 | ++ |
_mm_shuffle_pd | ++ |
_mm_shuffle_ps | ++ |
_mm_shufflehi_epi16 | ++ |
_mm_shufflelo_epi16 | ++ |
_mm_sign_epi16 | ++ |
_mm_sign_epi32 | ++ |
_mm_sign_epi8 | ++ |
_mm_sll_epi16 | +__lsx_vsll_h | +
_mm_sll_epi32 | +__lsx_vsll_w | +
_mm_sll_epi64 | +__lsx_vsll_d | +
_mm_slli_epi16 | +__lsx_vslli_h | +
_mm_slli_epi32 | +__lsx_vslli_w | +
_mm_slli_epi64 | +__lsx_vslli_d | +
_mm_slli_si128 | ++ |
_mm_sqrt_pd | +__lsx_vfsqrt_d | +
_mm_sqrt_ps | +__lsx_vfsqrt_s | +
_mm_sqrt_sd | ++ |
_mm_sqrt_ss | ++ |
_mm_sra_epi16 | +__lsx_vsra_h | +
_mm_sra_epi32 | +__lsx_vsra_w | +
_mm_srai_epi16 | +__lsx_vsrai_h | +
_mm_srai_epi32 | +__lsx_vsrai_w | +
_mm_srl_epi16 | +__lsx_vsrl_h | +
_mm_srl_epi32 | +__lsx_vsrl_w | +
_mm_srl_epi64 | +__lsx_vsrl_d | +
_mm_srli_epi16 | +__lsx_vsrli_h | +
_mm_srli_epi32 | +__lsx_vsrli_w | +
_mm_srli_epi64 | +__lsx_vsrli_d | +
_mm_srli_si128 | ++ |
_mm_store_pd | +__lsx_vst | +
_mm_store_pd1 | ++ |
_mm_store_ps | +__lsx_vst | +
_mm_store_ps1 | ++ |
_mm_store_sd | ++ |
_mm_store_si128 | ++ |
_mm_store_ss | +__lsx_vstelm_w | +
_mm_store1_pd | +__lsx_vreplvei_d + __lsx_vst | +
_mm_store1_ps | +__lsx_vreplvei_w + __lsx_vst | +
_mm_storeh_pd | ++ |
_mm_storeh_pi | ++ |
_mm_storel_epi64 | ++ |
_mm_storel_pd | ++ |
_mm_storel_pi | ++ |
_mm_storer_pd | ++ |
_mm_storer_ps | +__lsx_vshuf4i_w + __lsx_vst | +
_mm_storeu_pd | +__lsx_vst | +
_mm_storeu_ps | +__lsx_vst | +
_mm_storeu_si128 | ++ |
_mm_storeu_si16 | ++ |
_mm_storeu_si32 | ++ |
_mm_storeu_si64 | ++ |
_mm_stream_load_si128 | ++ |
_mm_stream_pd | ++ |
_mm_stream_ps | ++ |
_mm_stream_si128 | ++ |
_mm_sub_epi16 | +__lsx_vsub_h | +
_mm_sub_epi32 | +__lsx_vsub_w | +
_mm_sub_epi64 | +__lsx_vsub_d | +
_mm_sub_epi8 | +__lsx_vsub_b | +
_mm_sub_pd | +__lsx_vfsub_d | +
_mm_sub_ps | +__lsx_vfsub_s | +
_mm_sub_sd | ++ |
_mm_sub_ss | ++ |
_mm_subs_epi16 | ++ |
_mm_subs_epi8 | ++ |
_mm_subs_epu16 | ++ |
_mm_subs_epu8 | ++ |
_mm_test_all_ones | ++ |
_mm_test_all_zeros | ++ |
_mm_test_mix_ones_zeros | ++ |
_mm_testc_si128 | ++ |
_mm_testnzc_si128 | ++ |
_mm_testz_si128 | ++ |
_MM_TRANSPOSE4_PS | ++ |
_mm_ucomieq_sd | ++ |
_mm_ucomieq_ss | ++ |
_mm_ucomige_sd | ++ |
_mm_ucomige_ss | ++ |
_mm_ucomigt_sd | ++ |
_mm_ucomigt_ss | ++ |
_mm_ucomile_sd | ++ |
_mm_ucomile_ss | ++ |
_mm_ucomilt_sd | ++ |
_mm_ucomilt_ss | ++ |
_mm_ucomineq_sd | ++ |
_mm_ucomineq_ss | ++ |
_mm_undefined_pd | ++ |
_mm_undefined_ps | ++ |
_mm_undefined_si128 | ++ |
_mm_unpackhi_epi16 | +__lsx_vilvh_h | +
_mm_unpackhi_epi32 | +__lsx_vilvh_w | +
_mm_unpackhi_epi64 | +__lsx_vilvh_d | +
_mm_unpackhi_epi8 | +__lsx_vilvh_b | +
_mm_unpackhi_pd | +__lsx_vilvh_d | +
_mm_unpackhi_ps | +__lsx_vilvh_w | +
_mm_unpacklo_epi16 | +__lsx_vilvl_h | +
_mm_unpacklo_epi32 | +__lsx_vilvl_w | +
_mm_unpacklo_epi64 | +__lsx_vilvl_d | +
_mm_unpacklo_epi8 | +__lsx_vilvl_b | +
_mm_unpacklo_pd | +__lsx_vilvl_d | +
_mm_unpacklo_ps | +__lsx_vilvl_w | +
_mm_xor_pd | +__lsx_vxor_v | +
_mm_xor_ps | +__lsx_vxor_v | +
_mm_xor_si128 | +__lsx_vxor_v | +
The list of SSE intrinsics came from Intel Intrinsics Guide.
+ +