404
+ +Page not found
+ + +diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..bf3639ca --- /dev/null +++ b/404.html @@ -0,0 +1,172 @@ + + +
+ + + + +Page not found
+ + +This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et, al. The documentation is arranged from the following sources:
+__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvbitsel.v xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise selection: for each bit position, if the bit in c
equals to one, copy the bit from b
to dst
, otherwise copy from a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseli.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise selection: for each bit position, if the bit in a
equals to one, copy the bit from imm
to dst
, otherwise copy from b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.b xr, xr, xr
+CPU Flags: LASX
+
+Clear the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.h xr, xr, xr
+CPU Flags: LASX
+
+Clear the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.w xr, xr, xr
+CPU Flags: LASX
+
+Clear the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.d xr, xr, xr
+CPU Flags: LASX
+
+Clear the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.b xr, xr, imm
+CPU Flags: LASX
+
+Clear the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.h xr, xr, imm
+CPU Flags: LASX
+
+Clear the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.w xr, xr, imm
+CPU Flags: LASX
+
+Clear the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.d xr, xr, imm
+CPU Flags: LASX
+
+Clear the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitset_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.b xr, xr, xr
+CPU Flags: LASX
+
+Set the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitset_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.h xr, xr, xr
+CPU Flags: LASX
+
+Set the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitset_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.w xr, xr, xr
+CPU Flags: LASX
+
+Set the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitset_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.d xr, xr, xr
+CPU Flags: LASX
+
+Set the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.b xr, xr, imm
+CPU Flags: LASX
+
+Set the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.h xr, xr, imm
+CPU Flags: LASX
+
+Set the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.w xr, xr, imm
+CPU Flags: LASX
+
+Set the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.d xr, xr, imm
+CPU Flags: LASX
+
+Set the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.b xr, xr, xr
+CPU Flags: LASX
+
+Toggle the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.h xr, xr, xr
+CPU Flags: LASX
+
+Toggle the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.w xr, xr, xr
+CPU Flags: LASX
+
+Toggle the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.d xr, xr, xr
+CPU Flags: LASX
+
+Toggle the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.b xr, xr, imm
+CPU Flags: LASX
+
+Toggle the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.h xr, xr, imm
+CPU Flags: LASX
+
+Toggle the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.w xr, xr, imm
+CPU Flags: LASX
+
+Toggle the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.d xr, xr, imm
+CPU Flags: LASX
+
+Toggle the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvclo_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.b xr, xr
+CPU Flags: LASX
+
+Count leading ones of 8-bit elements in a
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = clo(a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvclo_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.h xr, xr
+CPU Flags: LASX
+
+Count leading ones of 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = clo(a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvclo_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.w xr, xr
+CPU Flags: LASX
+
+Count leading ones of 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = clo(a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvclo_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.d xr, xr
+CPU Flags: LASX
+
+Count leading ones of 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = clo(a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvclz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.b xr, xr
+CPU Flags: LASX
+
+Count leading zeros of 8-bit elements in a
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = clz(a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvclz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.h xr, xr
+CPU Flags: LASX
+
+Count leading zeros of 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = clz(a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvclz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.w xr, xr
+CPU Flags: LASX
+
+Count leading zeros of 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = clz(a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvclz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.d xr, xr
+CPU Flags: LASX
+
+Count leading zeros of 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = clz(a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvexth_h_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.h.b xr, xr
+CPU Flags: LASX
+
+Extend signed 8-bit elements in the higher half of a
to 16-bit.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[16 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvexth_hu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.hu.bu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 8-bit elements in the higher half of a
to 16-bit.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[16 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvexth_w_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.w.h xr, xr
+CPU Flags: LASX
+
+Extend signed 16-bit elements in the higher half of a
to 32-bit.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[8 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvexth_wu_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.wu.hu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 16-bit elements in the higher half of a
to 32-bit.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[8 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvexth_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.d.w xr, xr
+CPU Flags: LASX
+
+Extend signed 32-bit elements in the higher half of a
to 64-bit.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+for (; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[4 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvexth_du_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.du.wu xr, xr
+CPU Flags: LASX
+
+Extend unsigned 32-bit elements in the higher half of a
to 64-bit.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+for (; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[4 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvexth_q_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.q.d xr, xr
+CPU Flags: LASX
+
+Extend signed 64-bit elements in the higher half of a
to 128-bit.
int i;
+for (i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+for (; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvexth_qu_du (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.qu.du xr, xr
+CPU Flags: LASX
+
+Extend unsigned 64-bit elements in the higher half of a
to 128-bit.
int i;
+for (i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+for (; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvextl_q_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvextl.q.d xr, xr
+CPU Flags: LASX
+
+Extend signed 64-bit elements in the lower half of a
to 128-bit.
int i;
+for (i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[i];
+}
+for (; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvextl_qu_du (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvextl.qu.du xr, xr
+CPU Flags: LASX
+
+Extend unsigned 64-bit elements in the lower half of a
to 128-bit.
int i;
+for (i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[i];
+}
+for (; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.b xr, xr, imm
+CPU Flags: LASX
+
+Extract one 8-bit element in b
and insert it to a
according to imm
.
int i;
+for (i = 0; i < 16; i++) {
+ dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+for (; i < 32; i++) {
+ dst.byte[i] =
+ (i - 16 == ((imm >> 4) & 15)) ? b.byte[(imm & 15) + 16] : a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.h xr, xr, imm
+CPU Flags: LASX
+
+Extract one 16-bit element in b
and insert it to a
according to imm
.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (i - 8 == ((imm >> 4) & 7)) ? b.half[(imm & 7) + 8] : a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.w xr, xr, imm
+CPU Flags: LASX
+
+Extract one 32-bit element in b
and insert it to a
according to imm
.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (i - 4 == ((imm >> 4) & 3)) ? b.word[(imm & 3) + 4] : a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +3.96 | +
__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.d xr, xr, imm
+CPU Flags: LASX
+
+Extract one 64-bit element in b
and insert it to a
according to imm
.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+for (; i < 4; i++) {
+ dst.dword[i] =
+ (i - 2 == ((imm >> 4) & 1)) ? b.dword[(imm & 1) + 2] : a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpcnt_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.b xr, xr
+CPU Flags: LASX
+
+Count the number of ones (population, popcount) in 8-bit elements in a
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = popcount(a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvpcnt_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.h xr, xr
+CPU Flags: LASX
+
+Count the number of ones (population, popcount) in 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = popcount(a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvpcnt_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.w xr, xr
+CPU Flags: LASX
+
+Count the number of ones (population, popcount) in 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = popcount(a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvpcnt_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.d xr, xr
+CPU Flags: LASX
+
+Count the number of ones (population, popcount) in 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = popcount(a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
int __lasx_xbz_v (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvseteqz.v fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if the whole vector a
equals to zero.
dst = a.qword[0] == 0 && a.qword[1] == 0;
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbnz_v (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetnez.v fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if the whole vector a
is non-zero.
dst = a.qword[0] != 0 || a.qword[1] != 0;
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.b fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if any 8-bit element in a
equals to zero.
dst = 0;
+for (int i = 0; i < 32; i++) {
+ if (a.byte[i] == 0) {
+ dst = 1;
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.h fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if any 16-bit element in a
equals to zero.
dst = 0;
+for (int i = 0; i < 16; i++) {
+ if (a.half[i] == 0) {
+ dst = 1;
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.w fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if any 32-bit element in a
equals to zero.
dst = 0;
+for (int i = 0; i < 8; i++) {
+ if (a.word[i] == 0) {
+ dst = 1;
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.d fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if any 64-bit element in a
equals to zero.
dst = 0;
+for (int i = 0; i < 4; i++) {
+ if (a.dword[i] == 0) {
+ dst = 1;
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbnz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.b fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if all 8-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 32; i++) {
+ if (a.byte[i] == 0) {
+ dst = 0;
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbnz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.h fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if all 16-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 16; i++) {
+ if (a.half[i] == 0) {
+ dst = 0;
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbnz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.w fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if all 32-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 8; i++) {
+ if (a.word[i] == 0) {
+ dst = 0;
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lasx_xbnz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.d fcc, xr; bcnez
+CPU Flags: LASX
+
+Expected to be used in branches: branch if all 64-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 4; i++) {
+ if (a.dword[i] == 0) {
+ dst = 0;
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.caf.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.caf.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.ceq.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.ceq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.96 | +
__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cle.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cle.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.clt.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.clt.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cne.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cne.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.96 | +
__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cor.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cor.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cueq.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cueq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cule.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.95 | +
__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cule.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cult.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.96 | +
__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cult.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cun.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cun.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cune.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cune.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.saf.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.saf.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.seq.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.seq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sle.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sle.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.slt.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.slt.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sne.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sne.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sor.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sor.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sueq.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sueq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sule.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sule.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sult.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sult.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sun.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sun.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sune.s xr, xr, xr
+CPU Flags: LASX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 8; i++) {
+ if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sune.d xr, xr, xr
+CPU Flags: LASX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256 __lasx_xvfadd_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfadd.s xr, xr, xr
+CPU Flags: LASX
+
+Add single precision floating point elements in a
to elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256d __lasx_xvfadd_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfadd.d xr, xr, xr
+CPU Flags: LASX
+
+Add double precision floating point elements in a
to elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfdiv.s xr, xr, xr
+CPU Flags: LASX
+
+Divide single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18 | +
__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfdiv.d xr, xr, xr
+CPU Flags: LASX
+
+Divide double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 21.48 | +0.25 | +
__m256 __lasx_xvfmax_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmax.s xr, xr, xr
+CPU Flags: LASX
+
+Compute maximum of single precision floating point elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = +(a.fp32[i], b.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256d __lasx_xvfmax_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmax.d xr, xr, xr
+CPU Flags: LASX
+
+Compute maximum of double precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = +(a.fp64[i], b.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmaxa.s xr, xr, xr
+CPU Flags: LASX
+
+Compute maximum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmaxa.d xr, xr, xr
+CPU Flags: LASX
+
+Compute maximum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256 __lasx_xvfmin_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmax.s xr, xr, xr
+CPU Flags: LASX
+
+Compute minimum of single precision floating point elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = +(a.fp32[i], b.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256d __lasx_xvfmin_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmax.d xr, xr, xr
+CPU Flags: LASX
+
+Compute minimum of double precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = +(a.fp64[i], b.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256 __lasx_xvfmina_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmina.s xr, xr, xr
+CPU Flags: LASX
+
+Compute minimum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256d __lasx_xvfmina_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmina.d xr, xr, xr
+CPU Flags: LASX
+
+Compute minimum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256 __lasx_xvfmul_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmul.s xr, xr, xr
+CPU Flags: LASX
+
+Multiply single precision floating point elements in a
and elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256d __lasx_xvfmul_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmul.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply double precision floating point elements in a
and elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256 __lasx_xvfsub_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfsub.s xr, xr, xr
+CPU Flags: LASX
+
+Subtract single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256d __lasx_xvfsub_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfsub.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256 __lasx_xvflogb_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvflogb.s xr, xr
+CPU Flags: LASX
+
+Compute 2-based logarithm of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = log2(a.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256d __lasx_xvflogb_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvflogb.d xr, xr
+CPU Flags: LASX
+
+Compute 2-based logarithm of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = log2(a.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256 __lasx_xvfsqrt_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfsqrt.s xr, xr
+CPU Flags: LASX
+
+Compute square root of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +15 | +0.08 | +
__m256d __lasx_xvfsqrt_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfsqrt.d xr, xr
+CPU Flags: LASX
+
+Compute square root of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +36 | +0.06 | +
__m256 __lasx_xvfrsqrt_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrt.s xr, xr
+CPU Flags: LASX
+
+Compute reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +25 | +0.05 | +
__m256d __lasx_xvfrsqrt_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrt.d xr, xr
+CPU Flags: LASX
+
+Compute reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +15 | +0.04 | +
__m256 __lasx_xvfrecip_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrecip.s xr, xr
+CPU Flags: LASX
+
+Compute reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = 1 / a.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +19 | +0.18 | +
__m256d __lasx_xvfrecip_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrecip.d xr, xr
+CPU Flags: LASX
+
+Compute reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = 1 / a.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +23 | +0.25 | +
__m256 __lasx_xvfrsqrte_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrte.s xr, xr
+CPU Flags: LASX
+
+Compute estimated reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+__m256d __lasx_xvfrsqrte_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrte.d xr, xr
+CPU Flags: LASX
+
+Compute estimated reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+__m256 __lasx_xvfrecipe_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrecipe.s xr, xr
+CPU Flags: LASX
+
+Compute estimated reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+__m256d __lasx_xvfrecipe_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrecipe.d xr, xr
+CPU Flags: LASX
+
+Compute estimated reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+
+ __m256d __lasx_xvfcvth_d_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfcvth.d.s xr, xr
+CPU Flags: LASX
+
+Convert single precision floating point elements in higher half of a
to double precision.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp32[4 + i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256d __lasx_xvfcvtl_d_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfcvtl.d.s xr, xr
+CPU Flags: LASX
+
+Convert single precision floating point elements in lower half of a
to double precision.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcvt.s.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double precision floating point elements in a
and b
to double precision.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ dst.fp32[i] = b.fp64[i];
+ } else {
+ dst.fp32[i] = a.fp64[i - 4];
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256 __lasx_xvfcvth_s_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvfcvth.s.h xr, xr
+CPU Flags: LASX
+
+Convert half precision floating point elements in higher half of a
to single precision.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp16[8 + i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256 __lasx_xvfcvtl_s_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvfcvtl.s.h xr, xr
+CPU Flags: LASX
+
+Convert half precision floating point elements in lower half of a
to single precision.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp16[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcvt.h.s xr, xr, xr
+CPU Flags: LASX
+
+Convert single precision floating point elements in a
and b
to half precision.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ dst.fp16[i] = b.fp32[i];
+ } else {
+ dst.fp16[i] = a.fp32[i - 8];
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256d __lasx_xvffinth_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffinth.d.w xr, xr
+CPU Flags: LASX
+
+Convert 32-bit integer elements in higher part of a
to double precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i + 4]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256d __lasx_xvffintl_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffintl.d.w xr, xr
+CPU Flags: LASX
+
+Convert 32-bit integer elements in lower part of a
to double precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256d __lasx_xvffint_d_l (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.d.l xr, xr
+CPU Flags: LASX
+
+Convert signed 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256d __lasx_xvffint_d_lu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.d.lu xr, xr
+CPU Flags: LASX
+
+Convert unsigned 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256 __lasx_xvffint_s_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.s.w xr, xr
+CPU Flags: LASX
+
+Convert signed 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256 __lasx_xvffint_s_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.s.wu xr, xr
+CPU Flags: LASX
+
+Convert unsigned 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvffint.s.l xr, xr, xr
+CPU Flags: LASX
+
+Convert 64-bit integer elements in a
and b
to double-precision floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] =
+ (i < 4) ? (f32)(s32)a.dword[i]
+ : (f32)(s32)b.dword[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintl.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftinth_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftinth.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrml_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrml.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrmh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrmh.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrpl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrpl.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrph_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrph.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrzl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrzl.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrzh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrzh.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrnel_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrnel.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrneh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrneh.l.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftint_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftint.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftint_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftint.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrm_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrm.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrm_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrm.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrp_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrp.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrp_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrp.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrz_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrz_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrne_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrne.l.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrne_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrne.w.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftint_lu_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftint.lu.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftint_wu_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftint.wu.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrz_lu_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.lu.d xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftintrz_wu_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.wu.s xr, xr
+CPU Flags: LASX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, rounding towards zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftint.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i < 2)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrm.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i < 2)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrp.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i < 2)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrz.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i < 2)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrne.w.d xr, xr, xr
+CPU Flags: LASX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i < 2)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvfclass_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfclass.d xr, xr
+CPU Flags: LASX
+
+Classifiy each double precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvfclass_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfclass.s xr, xr
+CPU Flags: LASX
+
+Classifiy each single precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256 __lasx_xvfrint_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrint.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, using current rounding mode specified in fscr
, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256d __lasx_xvfrint_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrint.d xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, using current rounding mode specified in fscr
, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256 __lasx_xvfrintrp_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrp.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256d __lasx_xvfrintrp_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrp.d xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256 __lasx_xvfrintrm_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrm.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256d __lasx_xvfrintrm_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrm.d xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256 __lasx_xvfrintrz_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrz.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256d __lasx_xvfrintrz_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrz.d xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256 __lasx_xvfrintrne_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrne.s xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256d __lasx_xvfrintrne_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrne.d xr, xr
+CPU Flags: LASX
+
+Round single-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfnmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = (a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfnmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfnmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfnmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m256i __lasx_xvseq_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.b xr, xr, xr
+CPU Flags: LASX
+
+Compare the 8-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvseq_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.h xr, xr, xr
+CPU Flags: LASX
+
+Compare the 16-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvseq_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.w xr, xr, xr
+CPU Flags: LASX
+
+Compare the 32-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvseq_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.d xr, xr, xr
+CPU Flags: LASX
+
+Compare the 64-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.b xr, xr, imm
+CPU Flags: LASX
+
+Compare the 8-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.h xr, xr, imm
+CPU Flags: LASX
+
+Compare the 16-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.w xr, xr, imm
+CPU Flags: LASX
+
+Compare the 32-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.d xr, xr, imm
+CPU Flags: LASX
+
+Compare the 64-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslt_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.b xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvslt_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.bu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvslt_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.h xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvslt_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.hu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvslt_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.w xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvslt_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.wu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvslt_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.d xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvslt_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.du xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.b xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.bu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.h xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.hu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.w xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.wu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.d xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.du xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m256i __lasx_xvsle_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.b xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsle_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.bu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsle_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.h xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsle_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.hu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsle_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.w xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsle_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.wu xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsle_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.d xr, xr, xr
+CPU Flags: LASX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvsle_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.du xr, xr, xr
+CPU Flags: LASX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.b xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.bu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.h xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.hu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.w xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.wu xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.d xr, xr, imm
+CPU Flags: LASX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.du xr, xr, imm
+CPU Flags: LASX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m256i __lasx_xvadd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.b xr, xr, xr
+CPU Flags: LASX
+
+Add 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvadd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.h xr, xr, xr
+CPU Flags: LASX
+
+Add 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] + b.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvadd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.w xr, xr, xr
+CPU Flags: LASX
+
+Add 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] + b.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvadd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.d xr, xr, xr
+CPU Flags: LASX
+
+Add 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvadd_q (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.q xr, xr, xr
+CPU Flags: LASX
+
+Add 128-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = a.qword[i] + b.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvabsd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.b xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvabsd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.h xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvabsd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.w xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvabsd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.d xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvabsd_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.du xr, xr, xr
+CPU Flags: LASX
+
+Compute absolute difference of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvadda_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.b xr, xr, xr
+CPU Flags: LASX
+
+Add absolute of 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvadda_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.h xr, xr, xr
+CPU Flags: LASX
+
+Add absolute of 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvadda_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.w xr, xr, xr
+CPU Flags: LASX
+
+Add absolute of 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvadda_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.d xr, xr, xr
+CPU Flags: LASX
+
+Add absolute of 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.bu xr, xr, imm
+CPU Flags: LASX
+
+Add 8-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] + imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.hu xr, xr, imm
+CPU Flags: LASX
+
+Add 16-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] + imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.wu xr, xr, imm
+CPU Flags: LASX
+
+Add 32-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] + imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.du xr, xr, imm
+CPU Flags: LASX
+
+Add 64-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] + imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Add even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvavg_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.b xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavg_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavg_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.h xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavg_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavg_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.w xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavg_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavg_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.d xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvavg_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.du xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvavgr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.b xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavgr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.h xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavgr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.w xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvavgr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.d xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvavgr_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.du xr, xr, xr
+CPU Flags: LASX
+
+Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvdiv_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.b xr, xr, xr
+CPU Flags: LASX
+
+Divide signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 32 | +0.06 | +
__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.bu xr, xr, xr
+CPU Flags: LASX
+
+Divide unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 33 | +0.06 | +
__m256i __lasx_xvdiv_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.h xr, xr, xr
+CPU Flags: LASX
+
+Divide signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17 | +0.12 | +
__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.hu xr, xr, xr
+CPU Flags: LASX
+
+Divide unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 22 | +0.11 | +
__m256i __lasx_xvdiv_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.w xr, xr, xr
+CPU Flags: LASX
+
+Divide signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18 | +
__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.wu xr, xr, xr
+CPU Flags: LASX
+
+Divide unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18 | +
__m256i __lasx_xvdiv_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.d xr, xr, xr
+CPU Flags: LASX
+
+Divide signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25 | +
__m256i __lasx_xvdiv_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.du xr, xr, xr
+CPU Flags: LASX
+
+Divide unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25 | +
__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.h.b xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 8-bit elements in a
to even-positioned signed 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.hu.bu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 8-bit elements in a
to even-positioned unsigned 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.w.h xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 16-bit elements in a
to even-positioned signed 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.wu.hu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 16-bit elements in a
to even-positioned unsigned 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.d.w xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 32-bit elements in a
to even-positioned signed 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.du.wu xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 32-bit elements in a
to even-positioned unsigned 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.q.d xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned signed 64-bit elements in a
to even-positioned signed 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.qu.du xr, xr, xr
+CPU Flags: LASX
+
+Add odd-positioned unsigned 64-bit elements in a
to even-positioned unsigned 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.h.b xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 8-bit elements in a
by even-positioned signed 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.hu.bu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 8-bit elements in a
by even-positioned unsigned 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.w.h xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 16-bit elements in a
by even-positioned signed 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.wu.hu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 16-bit elements in a
by even-positioned unsigned 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.d.w xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 32-bit elements in a
by even-positioned signed 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.du.wu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 32-bit elements in a
by even-positioned unsigned 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.q.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 64-bit elements in a
by even-positioned signed 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.qu.du xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 64-bit elements in a
by even-positioned unsigned 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply 8-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply 16-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply 32-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply 64-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] =
+ (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] =
+ (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] =
+ (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+ (u32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+ (u64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+ (u128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m256i __lasx_xvmax_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.b xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmax_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmax_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.h xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmax_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmax_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.w xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmax_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmax_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.d xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvmax_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.du xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.b xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.bu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.h xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.hu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.w xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.wu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.d xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.du xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m256i __lasx_xvmin_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.b xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmin_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.bu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmin_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.h xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmin_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.hu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmin_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.w xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmin_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.wu xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmin_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.d xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvmin_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.du xr, xr, xr
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.b xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.bu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.h xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.hu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.w xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.wu xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.d xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.du xr, xr, imm
+CPU Flags: LASX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m256i __lasx_xvmod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.b xr, xr, xr
+CPU Flags: LASX
+
+Modulo residual signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 37 | +0.06 | +
__m256i __lasx_xvmod_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.bu xr, xr, xr
+CPU Flags: LASX
+
+Modulo residual unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 37 | +0.06 | +
__m256i __lasx_xvmod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.h xr, xr, xr
+CPU Flags: LASX
+
+Modulo residual signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 21 | +0.12 | +
__m256i __lasx_xvmod_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.hu xr, xr, xr
+CPU Flags: LASX
+
+Modulo residual unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 21 | +0.11 | +
__m256i __lasx_xvmod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.w xr, xr, xr
+CPU Flags: LASX
+
+Modulo residual signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11, 15 | +0.18 | +
__m256i __lasx_xvmod_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.wu xr, xr, xr
+CPU Flags: LASX
+
+Modulo residual unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11, 13 | +0.18 | +
__m256i __lasx_xvmod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.d xr, xr, xr
+CPU Flags: LASX
+
+Modulo residual signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 10 | +0.25 | +
__m256i __lasx_xvmod_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.du xr, xr, xr
+CPU Flags: LASX
+
+Modulo residual unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 10 | +0.25 | +
__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply 8-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply 16-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply 32-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply 64-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmuh_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply signed 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply unsigned 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmuh_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply signed 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply unsigned 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmuh_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply signed 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply unsigned 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmuh_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply signed 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmuh_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply unsigned 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmul_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmul_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] * b.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmul_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] * b.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmul_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m256i __lasx_xvneg_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.b xr, xr
+CPU Flags: LASX
+
+Negate 8-bit elements in a
and save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = -a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvneg_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.h xr, xr
+CPU Flags: LASX
+
+Negate 16-bit elements in a
and save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = -a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvneg_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.w xr, xr
+CPU Flags: LASX
+
+Negate 32-bit elements in a
and save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = -a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvneg_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.d xr, xr
+CPU Flags: LASX
+
+Negate 64-bit elements in a
and save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = -a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsadd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.b xr, xr, xr
+CPU Flags: LASX
+
+Saturing add the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.bu xr, xr, xr
+CPU Flags: LASX
+
+Saturing add the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsadd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.h xr, xr, xr
+CPU Flags: LASX
+
+Saturing add the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.hu xr, xr, xr
+CPU Flags: LASX
+
+Saturing add the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsadd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.w xr, xr, xr
+CPU Flags: LASX
+
+Saturing add the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.wu xr, xr, xr
+CPU Flags: LASX
+
+Saturing add the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsadd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.d xr, xr, xr
+CPU Flags: LASX
+
+Saturing add the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsadd_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.du xr, xr, xr
+CPU Flags: LASX
+
+Saturing add the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvssub_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.b xr, xr, xr
+CPU Flags: LASX
+
+Saturing subtract the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvssub_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.bu xr, xr, xr
+CPU Flags: LASX
+
+Saturing subtract the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvssub_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.h xr, xr, xr
+CPU Flags: LASX
+
+Saturing subtract the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvssub_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.hu xr, xr, xr
+CPU Flags: LASX
+
+Saturing subtract the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvssub_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.w xr, xr, xr
+CPU Flags: LASX
+
+Saturing subtract the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvssub_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.wu xr, xr, xr
+CPU Flags: LASX
+
+Saturing subtract the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvssub_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.d xr, xr, xr
+CPU Flags: LASX
+
+Saturing subtract the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvssub_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.du xr, xr, xr
+CPU Flags: LASX
+
+Saturing subtract the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsub_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.b xr, xr, xr
+CPU Flags: LASX
+
+Subtract 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsub_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.h xr, xr, xr
+CPU Flags: LASX
+
+Subtract 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] - b.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsub_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.w xr, xr, xr
+CPU Flags: LASX
+
+Subtract 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] - b.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsub_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsub_q (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.q xr, xr, xr
+CPU Flags: LASX
+
+Subtract 128-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = a.qword[i] - b.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.bu xr, xr, imm
+CPU Flags: LASX
+
+Subtract 8-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] - imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.hu xr, xr, imm
+CPU Flags: LASX
+
+Subtract 16-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] - imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.wu xr, xr, imm
+CPU Flags: LASX
+
+Subtract 32-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] - imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.du xr, xr, imm
+CPU Flags: LASX
+
+Subtract 64-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] - imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+Subtract even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+Subtract odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvand_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvand.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise AND between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvandi.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise AND between elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] & imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvandn_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvandn.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise ANDN between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvnor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvnor.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise NOR between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvnori.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise NOR between elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvor.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise OR between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvori.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise OR between elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] | imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvorn_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvorn.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise ORN between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvxor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvxor.v xr, xr, xr
+CPU Flags: LASX
+
+Compute bitwise XOR between elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvxori.b xr, xr, imm
+CPU Flags: LASX
+
+Compute bitwise XOR between elements in a
and imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] ^ imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvld xr, r, imm
+CPU Flags: LASX
+
+Read whole vector from memory address addr + offset
, save the data into dst
.
dst = memory_load(256, addr + offset);
+
+__m256i __lasx_xvldx (void * addr, long int offset)
+#include <lasxintrin.h>
+Instruction: xvldx xr, r, r
+CPU Flags: LASX
+
+Read whole vector from memory address addr + offset
, save the data into dst
.
dst = memory_load(256, addr + offset);
+
+__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.b xr, r, imm
+CPU Flags: LASX
+
+Read 8-bit data from memory address addr + (offset << 0)
, replicate the data to all vector lanes and save into dst
.
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 32; i++) {
+ dst.byte[i] = data;
+}
+
+__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.h xr, r, imm
+CPU Flags: LASX
+
+Read 16-bit data from memory address addr + (offset << 1)
, replicate the data to all vector lanes and save into dst
.
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 16; i++) {
+ dst.half[i] = data;
+}
+
+__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.w xr, r, imm
+CPU Flags: LASX
+
+Read 32-bit data from memory address addr + (offset << 2)
, replicate the data to all vector lanes and save into dst
.
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 8; i++) {
+ dst.word[i] = data;
+}
+
+__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.d xr, r, imm
+CPU Flags: LASX
+
+Read 64-bit data from memory address addr + (offset << 3)
, replicate the data to all vector lanes and save into dst
.
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 4; i++) {
+ dst.dword[i] = data;
+}
+
+void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvst xr, r, imm
+CPU Flags: LASX
+
+Write whole vector data in data
to memory address addr + offset
.
memory_store(256, data, addr + offset);
+
+void __lasx_xvstx (__m256i data, void * addr, long int offset)
+#include <lasxintrin.h>
+Instruction: xvstx xr, r, r
+CPU Flags: LASX
+
+Write whole-vector data in data
to memory address addr + offset
.
memory_store(256, data, addr + offset);
+
+void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.b xr, r, imm, imm
+CPU Flags: LASX
+
+Store the 8-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(8, data.byte[lane], addr + offset);
+
+void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.h xr, r, imm, imm
+CPU Flags: LASX
+
+Store the 16-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(16, data.half[lane], addr + offset);
+
+void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.w xr, r, imm, imm
+CPU Flags: LASX
+
+Store the 32-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(32, data.word[lane], addr + offset);
+
+void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.d xr, r, imm, imm
+CPU Flags: LASX
+
+Store the 64-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(64, data.dword[lane], addr + offset);
+
+
+ __m256i __lasx_vext2xv_h_b (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.h.b xr, xr
+CPU Flags: LSX
+
+Extend signed 8-bit lane of a
to signed 16-bit elements.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_hu_bu (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.hu.bu xr, xr
+CPU Flags: LSX
+
+Extend unsigned 8-bit lane of a
to unsigned 16-bit elements.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_w_b (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.w.b xr, xr
+CPU Flags: LSX
+
+Extend signed 8-bit lane of a
to signed 32-bit elements.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s8)a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_wu_bu (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.wu.bu xr, xr
+CPU Flags: LSX
+
+Extend unsigned 8-bit lane of a
to unsigned 32-bit elements.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u8)a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_w_h (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.w.h xr, xr
+CPU Flags: LSX
+
+Extend signed 16-bit lane of a
to signed 32-bit elements.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_wu_hu (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.wu.hu xr, xr
+CPU Flags: LSX
+
+Extend unsigned 16-bit lane of a
to unsigned 32-bit elements.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_d_b (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.d.b xr, xr
+CPU Flags: LSX
+
+Extend signed 8-bit lane of a
to signed 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s8)a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_du_bu (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.du.bu xr, xr
+CPU Flags: LSX
+
+Extend unsigned 8-bit lane of a
to unsigned 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u8)a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +3.96 | +
__m256i __lasx_vext2xv_d_h (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.d.h xr, xr
+CPU Flags: LSX
+
+Extend signed 16-bit lane of a
to signed 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_du_hu (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.du.hu xr, xr
+CPU Flags: LSX
+
+Extend unsigned 16-bit lane of a
to unsigned 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_d_w (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.d.w xr, xr
+CPU Flags: LSX
+
+Extend signed 32-bit lane of a
to signed 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_vext2xv_du_wu (__m256i a)
+#include <lsxintrin.h>
+Instruction: vext2xv.du.wu xr, xr
+CPU Flags: LSX
+
+Extend unsigned 32-bit lane of a
to unsigned 64-bit elements.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_xvilvh_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.b xr, xr, xr
+CPU Flags: LASX
+
+Interleave 8-bit elements in higher half of a
and b
.
int i;
+for (i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+for (; i < 32; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 16] : b.byte[i / 2 + 16];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvilvh_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.h xr, xr, xr
+CPU Flags: LASX
+
+Interleave 16-bit elements in higher half of a
and b
.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 8] : b.half[i / 2 + 8];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvilvh_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.w xr, xr, xr
+CPU Flags: LASX
+
+Interleave 32-bit elements in higher half of a
and b
.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 4] : b.word[i / 2 + 4];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvilvh_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.d xr, xr, xr
+CPU Flags: LASX
+
+Interleave 64-bit elements in higher half of a
and b
.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+for (; i < 4; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 2] : b.dword[i / 2 + 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvilvl_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.b xr, xr, xr
+CPU Flags: LASX
+
+Interleave 8-bit elements in lower half of a
and b
.
int i;
+for (i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+for (; i < 32; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvilvl_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.h xr, xr, xr
+CPU Flags: LASX
+
+Interleave 16-bit elements in lower half of a
and b
.
int i;
+for (i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+for (; i < 16; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvilvl_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.w xr, xr, xr
+CPU Flags: LASX
+
+Interleave 32-bit elements in lower half of a
and b
.
int i;
+for (i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+for (; i < 8; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvilvl_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.d xr, xr, xr
+CPU Flags: LASX
+
+Interleave 64-bit elements in lower half of a
and b
.
int i;
+for (i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+for (; i < 4; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvinsgr2vr.w xr, r, imm
+CPU Flags: LASX
+
+Insert 32-bit element into lane indexed imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvinsgr2vr.d xr, r, imm
+CPU Flags: LASX
+
+Insert 64-bit element into lane indexed imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvinsve0.w xr, xr, imm
+CPU Flags: LASX
+
+Insert the first 32-bit lane of b
into lane indexed imm
of a
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i == imm) ? b.word[0] : a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvinsve0.d xr, xr, imm
+CPU Flags: LASX
+
+Insert the first 64-bit lane of b
into lane indexed imm
of a
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i == imm) ? b.dword[0] : a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +3.96 | +
__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvfrstp.b xr, xr, xr
+CPU Flags: LASX
+
+Find the first negative 8-bit element in b
, set the index of the element to the lane of a
specified by c
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[c.byte[0] % 16] = i;
+for (i = 16; i < 32; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[(c.byte[16] % 16) + 16] = i - 16;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvfrstp.h xr, xr, xr
+CPU Flags: LASX
+
+Find the first negative 16-bit element in b
, set the index of the element to the lane of a
specified by c
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[c.half[0] % 8] = i;
+for (i = 8; i < 16; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[(c.half[8] % 8) + 8] = i - 8;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvfrstpi.b xr, xr, imm
+CPU Flags: LASX
+
+Find the first negative 8-bit element in b
, set the index of the element to the lane of a
specified by imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[imm % 16] = i;
+for (i = 16; i < 32; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[(imm % 16) + 16] = i - 16;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvfrstpi.h xr, xr, imm
+CPU Flags: LASX
+
+Find the first negative 16-bit element in b
, set the index of the element to the lane of a
specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[imm % 8] = i;
+for (i = 8; i < 16; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[(imm % 8) + 8] = i - 8;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvmskgez_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskgez.b xr, xr
+CPU Flags: LASX
+
+For each 8-bit element in a
, if the element is greater than or equal to zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmskltz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.b xr, xr
+CPU Flags: LASX
+
+For each 8-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[3] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmskltz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.h xr, xr
+CPU Flags: LASX
+
+For each 16-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[2] |= c << 4;
+dst.dword[3] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmskltz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.w xr, xr
+CPU Flags: LASX
+
+For each 32-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 31;
+c >>= 62;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 31;
+c >>= 62;
+dst.dword[2] |= c << 2;
+dst.dword[3] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmskltz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.d xr, xr
+CPU Flags: LASX
+
+For each 64-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c >>= 63;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c >>= 63;
+dst.dword[2] |= c << 1;
+dst.dword[3] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvmsknz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmsknz.b xr, xr
+CPU Flags: LASX
+
+For each 8-bit element in a
, if the element is non-zero, set one bit in dst
, otherwise clear it.
u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = ~(((a.dword[2] & m) + m) | a.dword[2] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = ~(((a.dword[3] & m) + m) | a.dword[3] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpackev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.b xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack even-positioned 8-bit elements in a
and b
and store dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpackev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.h xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack even-positioned 16-bit elements in a
and b
and store dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpackev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.w xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack even-positioned 32-bit elements in a
and b
and store dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpackev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.d xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack even-positioned 64-bit elements in a
and b
and store dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpackod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.b xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack odd-positioned 8-bit elements in a
and b
and store dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpackod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.h xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack odd-positioned 16-bit elements in a
and b
and store dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpackod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.w xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack odd-positioned 32-bit elements in a
and b
and store dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpackod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.d xr, xr, xr
+CPU Flags: LASX
+
+Collect and pack odd-positioned 64-bit elements in a
and b
and store dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +3.92 | +
__m256i __lasx_xvpickev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.b xr, xr, xr
+CPU Flags: LASX
+
+Pick even-positioned 8-bit elements in b
first, then pick even-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2] : a.byte[(i - 16) * 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpickev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.h xr, xr, xr
+CPU Flags: LASX
+
+Pick even-positioned 16-bit elements in b
first, then pick even-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (i < 12) ? b.half[(i - 4) * 2] : a.half[(i - 8) * 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpickev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.w xr, xr, xr
+CPU Flags: LASX
+
+Pick even-positioned 32-bit elements in b
first, then pick even-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (i < 6) ? b.word[(i - 2) * 2] : a.word[(i - 4) * 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpickev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.d xr, xr, xr
+CPU Flags: LASX
+
+Pick even-positioned 64-bit elements in b
first, then pick even-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2] : a.dword[(i - 2) * 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+
+Copy one 32-bit lane from a
specified by imm
to the first lane of dst
, and set the other lanes to zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 3 | +4 | +
__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+
+Copy one 64-bit lane from a
specified by imm
to the first lane of dst
, and set the other lanes to zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 3 | +4 | +
__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+
+Copy one 32-bit lane from a
specified by imm
to the first lane of dst
, and set the other lanes to zero.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 3 | +4 | +
__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+
+Copy one 64-bit lane from a
specified by imm
to the first lane of dst
, and set the other lanes to zero.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 3 | +4 | +
int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.w r, xr, imm
+CPU Flags: LASX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s32)a.word[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.wu r, xr, imm
+CPU Flags: LASX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u32)a.word[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.d r, xr, imm
+CPU Flags: LASX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s64)a.dword[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.du r, xr, imm
+CPU Flags: LASX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u64)a.dword[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvpickod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.b xr, xr, xr
+CPU Flags: LASX
+
+Pick odd-positioned 8-bit elements in b
first, then pick odd-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2 + 1] : a.byte[(i - 16) * 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpickod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.h xr, xr, xr
+CPU Flags: LASX
+
+Pick odd-positioned 16-bit elements in b
first, then pick odd-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (i < 12) ? b.half[(i - 4) * 2 + 1] : a.half[(i - 8) * 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpickod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.w xr, xr, xr
+CPU Flags: LASX
+
+Pick odd-positioned 32-bit elements in b
first, then pick odd-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (i < 6) ? b.word[(i - 2) * 2 + 1] : a.word[(i - 4) * 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpickod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.d xr, xr, xr
+CPU Flags: LASX
+
+Pick odd-positioned 64-bit elements in b
first, then pick odd-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2 + 1] : a.dword[(i - 2) * 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvrepli_b (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = imm;
+}
+
+Tested on real machine.
+__m256i __lasx_xvrepli_h (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = imm;
+}
+
+Tested on real machine.
+__m256i __lasx_xvrepli_w (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = imm;
+}
+
+Tested on real machine.
+__m256i __lasx_xvrepli_d (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = imm;
+}
+
+Tested on real machine.
+__m256i __lasx_xvreplgr2vr_b (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.b xr, r
+CPU Flags: LASX
+
+Repeat val
to whole vector.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = val;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvreplgr2vr_h (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.h xr, r
+CPU Flags: LASX
+
+Repeat val
to whole vector.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = val;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvreplgr2vr_w (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.w xr, r
+CPU Flags: LASX
+
+Repeat val
to whole vector.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = val;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvreplgr2vr_d (long int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.d xr, r
+CPU Flags: LASX
+
+Repeat val
to whole vector.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = val;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvreplve_b (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.b xr, xr, r
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx % 16];
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = a.byte[(idx % 16) + 16];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvreplve_h (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.h xr, xr, r
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx % 8];
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = a.half[(idx % 8) + 8];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvreplve_w (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.w xr, xr, r
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx % 4];
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = a.word[(idx % 4) + 4];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvreplve_d (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.d xr, xr, r
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx % 2];
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = a.dword[(idx % 2) + 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m256i __lasx_xvreplve0_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.b xr, xr
+CPU Flags: LASX
+
+Repeat the first 8-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[0];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_xvreplve0_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.h xr, xr
+CPU Flags: LASX
+
+Repeat the first 16-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[0];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_xvreplve0_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.w xr, xr
+CPU Flags: LASX
+
+Repeat the first 32-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[0];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_xvreplve0_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.d xr, xr
+CPU Flags: LASX
+
+Repeat the first 64-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[0];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_xvreplve0_q (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.q xr, xr
+CPU Flags: LASX
+
+Repeat the first 128-bit lane from a
to all lanes of dst
.
for (int i = 0; i < 2; i++) {
+ dst.qword[i] = a.qword[0];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +3.96 | +
__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.b xr, xr, imm
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx];
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = a.byte[idx + 16];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.h xr, xr, imm
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx];
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = a.half[idx + 8];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.w xr, xr, imm
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx];
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = a.word[idx + 4];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.d xr, xr, imm
+CPU Flags: LASX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx];
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = a.dword[idx + 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.b xr, xr, imm
+CPU Flags: LASX
+
+Clamp signed 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.bu xr, xr, imm
+CPU Flags: LASX
+
+Clamp unsigned 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.h xr, xr, imm
+CPU Flags: LASX
+
+Clamp signed 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.hu xr, xr, imm
+CPU Flags: LASX
+
+Clamp unsigned 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.w xr, xr, imm
+CPU Flags: LASX
+
+Clamp signed 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.wu xr, xr, imm
+CPU Flags: LASX
+
+Clamp unsigned 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.d xr, xr, imm
+CPU Flags: LASX
+
+Clamp signed 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.du xr, xr, imm
+CPU Flags: LASX
+
+Clamp unsigned 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.b xr, xr, xr
+CPU Flags: LASX
+
+If the 8-bit element in a
equals to zero, set the result to zero. If the signed 8-bit element in a
is posiive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] =
+ (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.h xr, xr, xr
+CPU Flags: LASX
+
+If the 16-bit element in a
equals to zero, set the result to zero. If the signed 16-bit element in a
is posiive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 16; i++) {
+ dst.half[i] =
+ (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.w xr, xr, xr
+CPU Flags: LASX
+
+If the 32-bit element in a
equals to zero, set the result to zero. If the signed 32-bit element in a
is posiive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 8; i++) {
+ dst.word[i] =
+ (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.d xr, xr, xr
+CPU Flags: LASX
+
+If the 64-bit element in a
equals to zero, set the result to zero. If the signed 64-bit element in a
is posiive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] =
+ (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvldi (imm_n1024_1023 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+Initialize dst
using predefined patterns:
imm[12:10]=0b000
: broadcast imm[7:0]
as 8-bit elements to all lanesimm[12:10]=0b001
: broadcast sign-extended imm[9:0]
as 16-bit elements to all lanesimm[12:10]=0b010
: broadcast sign-extended imm[9:0]
as 32-bit elements to all lanesimm[12:10]=0b011
: broadcast sign-extended imm[9:0]
as 64-bit elements to all lanesimm[12:8]=0b10000
: broadcast imm[7:0]
as 32-bit elements to all lanesimm[12:8]=0b10001
: broadcast imm[7:0] << 8
as 32-bit elements to all lanesimm[12:8]=0b10010
: broadcast imm[7:0] << 16
as 32-bit elements to all lanesimm[12:8]=0b10011
: broadcast imm[7:0] << 24
as 32-bit elements to all lanesimm[12:8]=0b10100
: broadcast imm[7:0]
as 16-bit elements to all lanesimm[12:8]=0b10101
: broadcast imm[7:0] << 8
as 16-bit elements to all lanesimm[12:8]=0b10110
: broadcast (imm[7:0] << 8) | 0xFF
as 32-bit elements to all lanesimm[12:8]=0b10111
: broadcast (imm[7:0] << 16) | 0xFFFF
as 32-bit elements to all lanesimm[12:8]=0b11000
: broadcast imm[7:0]
as 8-bit elements to all lanesimm[12:8]=0b11001
: repeat each bit of imm[7:0]
eight times, and broadcast the result as 64-bit elements to all lanesimm[12:8]=0b11010
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 32-bit elements to all lanesimm[12:8]=0b11011
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 64-bit elements to all lanesimm[12:8]=0b11100
: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)
as 64-bit elements to all lanesu64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+ broadcast_value = simm9_0;
+ broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+ broadcast_value = simm9_0;
+ broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+ broadcast_value = simm9_0;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+ broadcast_value = imm7_0 << 16;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+ broadcast_value = imm7_0 << 24;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+ broadcast_value = imm7_0;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+ broadcast_value = (imm7_0 << 8) | 0xFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+ broadcast_value = (imm7_0 << 16) | 0xFFFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+ broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+ imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+ imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+ imm7 * 0xFF00000000000000;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+ broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+ (imm5_0 << 48);
+ broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+ for (int i = 0; i < 32; i++) {
+ dst.byte[i] = broadcast_value;
+ }
+} else if (broadcast_width == 16) {
+ for (int i = 0; i < 16; i++) {
+ dst.half[i] = broadcast_value;
+ }
+} else if (broadcast_width == 32) {
+ for (int i = 0; i < 8; i++) {
+ dst.word[i] = broadcast_value;
+ }
+} else if (broadcast_width == 64) {
+ for (int i = 0; i < 4; i++) {
+ dst.dword[i] = broadcast_value;
+ }
+}
+
+
+ __m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.w xr, xr, imm
+CPU Flags: LASX
+
+Permute words from a
and b
with indices recorded in imm
and store into dst
.
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+dst.word[4] = b.word[4 + (imm & 0x3)];
+dst.word[5] = b.word[4 + ((imm >> 2) & 0x3)];
+dst.word[6] = a.word[4 + ((imm >> 4) & 0x3)];
+dst.word[7] = a.word[4 + ((imm >> 6) & 0x3)];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.d xr, xr, imm
+CPU Flags: LASX
+
+Permute double words from a
and b
with indices recorded in imm
and store into dst
.
dst.dword[0] = a.dword[imm & 0x3];
+dst.dword[1] = a.dword[(imm >> 2) & 0x3];
+dst.dword[2] = a.dword[(imm >> 4) & 0x3];
+dst.dword[3] = a.dword[(imm >> 6) & 0x3];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 3 | +4 | +
__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.d xr, xr, imm
+CPU Flags: LASX
+
+Permute quad words from a
and b
with indices recorded in imm
and store into dst
.
if ((imm & 0x4) && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.qword[0] = 0;
+} else {
+ dst.qword[0] = (imm & 2) ? a.qword[imm & 0x1] : b.qword[imm & 0x1];
+}
+if ((imm & 0x80) && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.qword[1] = 0;
+} else {
+ dst.qword[1] =
+ (imm & 0x20) ? a.qword[(imm >> 4) & 0x1] : b.qword[(imm >> 4) & 0x1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 3 | +4 | +
__m256i __lasx_xvperm_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvperm.w xr, xr, xr
+CPU Flags: LASX
+
+Permute words from a
with indices recorded in b
and store into dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[b.word[i] % 0x8];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbsll.v xr, xr, imm
+CPU Flags: LASX
+
+Compute whole vector a
shifted left by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+dst.qword[1] = (u128)a.qword[1] << shift;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbsrl.v xr, xr, imm
+CPU Flags: LASX
+
+Compute whole vector a
shifted right by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+dst.qword[1] = (u128)a.qword[1] >> shift;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsll_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.b xr, xr, xr
+CPU Flags: LASX
+
+Logical left shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsll_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.h xr, xr, xr
+CPU Flags: LASX
+
+Logical left shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsll_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.w xr, xr, xr
+CPU Flags: LASX
+
+Logical left shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsll_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.d xr, xr, xr
+CPU Flags: LASX
+
+Logical left shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.b xr, xr, imm
+CPU Flags: LASX
+
+Logical left shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.h xr, xr, imm
+CPU Flags: LASX
+
+Logical left shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.w xr, xr, imm
+CPU Flags: LASX
+
+Logical left shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.d xr, xr, imm
+CPU Flags: LASX
+
+Logical left shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.h.b xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift signed 8-bit elements in a
by imm
to signed 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i + 8] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.52, 2 | +1.93 | +
__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.hu.bu xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift unsigned 8-bit elements in a
by imm
to unsigned 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i + 8] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.w.h xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift signed 16-bit elements in a
by imm
to signed 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (s32)(s16)a.half[i + 4] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.52, 2 | +1.95 | +
__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.wu.hu xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift unsigned 16-bit elements in a
by imm
to unsigned 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (u32)(u16)a.half[i + 4] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.d.w xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift signed 32-bit elements in a
by imm
to signed 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i + 2] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.du.wu xr, xr, imm
+CPU Flags: LASX
+
+Extend and shift unsigned 32-bit elements in a
by imm
to unsigned 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i + 2] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m256i __lasx_xvsra_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.b xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsra_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsra_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsra_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.b xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.b.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? (s8)((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.h.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] =
+ (i < 12) ? (s16)((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.w.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] =
+ (i < 6) ? (s32)((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.b.h xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? (s8)((s16)b.half[i - 8] >> imm)
+ : (s8)((s16)a.half[i - 16] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.h.w xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (i < 12) ? (s16)((s32)b.word[i - 4] >> imm)
+ : (s16)((s32)a.word[i - 8] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.w.d xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+ : (s32)((s64)a.dword[i - 2] >> imm);
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (i < 6) ? (s32)((s64)b.dword[i - 2] >> imm)
+ : (s32)((s64)a.dword[i - 4] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.d.q xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+ : (s64)((s128)a.qword[i - 1] >> imm);
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (i < 3) ? (s64)((s128)b.qword[i - 1] >> imm)
+ : (s64)((s128)a.qword[i - 2] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrar_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.b xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+ (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrar_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+ (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrar_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+ (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrar_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.b xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.h xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] =
+ ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.w xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] =
+ ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.d xr, xr, imm
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] =
+ ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.b.h xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+ (((s16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u8 shift = (b.half[i - 8] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 8] >> shift) +
+ (((s16)a.half[i - 8] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.h.w xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (s16)(s32)a.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+ (((s32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u8 shift = (b.word[i - 4] & 31);
+ if (shift == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 4] >> shift) +
+ (((s32)a.word[i - 4] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.w.d xr, xr, xr
+CPU Flags: LASX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+ (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u8 shift = (b.dword[i - 2] & 63);
+ if (shift == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 2] >> shift) +
+ (((s64)a.dword[i - 2] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+ (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)b.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)b.half[i - 8] >> imm) +
+ (((s16)b.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 16];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 16] >> imm) +
+ (((s16)a.half[i - 16] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)b.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+ (((s32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+ (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)b.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)b.word[i - 4] >> imm) +
+ (((s32)b.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 8];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 8] >> imm) +
+ (((s32)a.word[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)b.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+ (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)b.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)b.dword[i - 2] >> imm) +
+ (((s64)b.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 4];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 4] >> imm) +
+ (((s64)a.dword[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)b.qword[i];
+ } else {
+ dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+ (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)b.qword[i - 1];
+ } else {
+ dst.dword[i] = (s64)(((s128)b.qword[i - 1] >> imm) +
+ (((s128)b.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)a.qword[i - 2];
+ } else {
+ dst.dword[i] = (s64)(((s128)a.qword[i - 2] >> imm) +
+ (((s128)a.qword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrl_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.b xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsrl_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsrl_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsrl_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.b xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[i] >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[i] >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[i] >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = a.dword[i] >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.b.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? (u8)((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.h.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] =
+ (i < 12) ? (u16)((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.w.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] =
+ (i < 6) ? (u32)((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+for (int i = 16; i < 32; i++) {
+ dst.byte[i] = (i < 24) ? (u8)((u16)b.half[i - 8] >> imm)
+ : (u8)((u16)a.half[i - 16] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+for (int i = 8; i < 16; i++) {
+ dst.half[i] = (i < 12) ? (u16)((u32)b.word[i - 4] >> imm)
+ : (u16)((u32)a.word[i - 8] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+ : (u32)((u64)a.dword[i - 2] >> imm);
+}
+for (int i = 4; i < 8; i++) {
+ dst.word[i] = (i < 6) ? (u32)((u64)b.dword[i - 2] >> imm)
+ : (u32)((u64)a.dword[i - 4] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+ : (u64)((u128)a.qword[i - 1] >> imm);
+}
+for (int i = 2; i < 4; i++) {
+ dst.dword[i] = (i < 3) ? (u64)((u128)b.qword[i - 1] >> imm)
+ : (u64)((u128)a.qword[i - 2] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.b xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+ ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+ ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+ ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+ ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.b xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+ (((u16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u8 shift = (b.half[i - 8] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 8] >> shift) +
+ (((u16)a.half[i - 8] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (u16)(u32)a.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+ (((u32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u8 shift = (b.word[i - 4] & 31);
+ if (shift == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 4] >> shift) +
+ (((u32)a.word[i - 4] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+ (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u8 shift = (b.dword[i - 2] & 63);
+ if (shift == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 2] >> shift) +
+ (((u64)a.dword[i - 2] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+ (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)b.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)b.half[i - 8] >> imm) +
+ (((u16)b.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 16];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 16] >> imm) +
+ (((u16)a.half[i - 16] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)b.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+ (((u32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+ (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)b.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)b.word[i - 4] >> imm) +
+ (((u32)b.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 8];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 8] >> imm) +
+ (((u32)a.word[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)b.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+ (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)b.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)b.dword[i - 2] >> imm) +
+ (((u64)b.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 4];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 4] >> imm) +
+ (((u64)a.dword[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)b.qword[i];
+ } else {
+ dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+ (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)b.qword[i - 1];
+ } else {
+ dst.dword[i] = (u64)(((u128)b.qword[i - 1] >> imm) +
+ (((u128)b.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)a.qword[i - 2];
+ } else {
+ dst.dword[i] = (u64)(((u128)a.qword[i - 2] >> imm) +
+ (((u128)a.qword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.b.h xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift the signed 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.bu.h xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift the signed 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.h.w xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift the signed 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.hu.w xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift the signed 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.w.d xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift the signed 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.wu.d xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift the signed 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.b.h xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp = (s16)b.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp = (s16)a.half[i - 16] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.bu.h xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp = (s16)b.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp = (s16)a.half[i - 16] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.h.w xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp = (s32)b.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp = (s32)a.word[i - 8] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.hu.w xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp = (s32)b.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp = (s32)a.word[i - 8] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.w.d xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp = (s64)b.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp = (s64)a.dword[i - 4] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.wu.d xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp = (s64)b.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp = (s64)a.dword[i - 4] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.d.q xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ s128 temp = (s128)b.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp = (s128)a.qword[i - 2] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.du.q xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift the signed 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ s128 temp = (s128)b.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp = (s128)a.qword[i - 2] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.b.h xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp;
+ if ((b.half[i - 8] & 15) == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+ (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.bu.h xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp;
+ if ((b.half[i - 8] & 15) == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+ (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.h.w xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp;
+ if ((b.word[i - 4] & 31) == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+ (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.hu.w xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp;
+ if ((b.word[i - 4] & 31) == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+ (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.w.d xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp;
+ if ((b.dword[i - 2] & 63) == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+ (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.wu.d xr, xr, xr
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp;
+ if ((b.dword[i - 2] & 63) == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+ (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i - 8];
+ } else {
+ temp =
+ ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 16];
+ } else {
+ temp = ((s16)a.half[i - 16] >> imm) +
+ (((s16)a.half[i - 16] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i - 8];
+ } else {
+ temp =
+ ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 16];
+ } else {
+ temp = ((s16)a.half[i - 16] >> imm) +
+ (((s16)a.half[i - 16] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i - 4];
+ } else {
+ temp =
+ ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 8];
+ } else {
+ temp =
+ ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i - 4];
+ } else {
+ temp =
+ ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 8];
+ } else {
+ temp =
+ ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i - 2];
+ } else {
+ temp = ((s64)b.dword[i - 2] >> imm) +
+ (((s64)b.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 4];
+ } else {
+ temp = ((s64)a.dword[i - 4] >> imm) +
+ (((s64)a.dword[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i - 2];
+ } else {
+ temp = ((s64)b.dword[i - 2] >> imm) +
+ (((s64)b.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 4];
+ } else {
+ temp = ((s64)a.dword[i - 4] >> imm) +
+ (((s64)a.dword[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i - 1];
+ } else {
+ temp = ((s128)b.qword[i - 1] >> imm) +
+ (((s128)b.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 2];
+ } else {
+ temp = ((s128)a.qword[i - 2] >> imm) +
+ (((s128)a.qword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.du.q xr, xr, imm
+CPU Flags: LASX
+
+Arithemtic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i - 1];
+ } else {
+ temp = ((s128)b.qword[i - 1] >> imm) +
+ (((s128)b.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 2];
+ } else {
+ temp = ((s128)a.qword[i - 2] >> imm) +
+ (((s128)a.qword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.b.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.bu.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.h.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.hu.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.w.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.wu.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp = (u16)b.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp = (u16)a.half[i - 16] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp = (u16)b.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp = (u16)a.half[i - 16] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp = (u32)b.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp = (u32)a.word[i - 8] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp = (u32)b.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp = (u32)a.word[i - 8] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp = (u64)b.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp = (u64)a.dword[i - 4] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp = (u64)b.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp = (u64)a.dword[i - 4] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ u128 temp = (u128)b.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp = (u128)a.qword[i - 2] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.du.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ u128 temp = (u128)b.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp = (u128)a.qword[i - 2] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp;
+ if ((b.half[i - 8] & 15) == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+ (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.bu.h xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp;
+ if ((b.half[i - 8] & 15) == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+ (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp;
+ if ((b.word[i - 4] & 31) == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+ (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.hu.w xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp;
+ if ((b.word[i - 4] & 31) == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+ (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp;
+ if ((b.dword[i - 2] & 63) == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+ (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.wu.d xr, xr, xr
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp;
+ if ((b.dword[i - 2] & 63) == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+ (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i - 8];
+ } else {
+ temp =
+ ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 16];
+ } else {
+ temp = ((u16)a.half[i - 16] >> imm) +
+ (((u16)a.half[i - 16] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+for (int i = 16; i < 32; i++) {
+ if (i < 24) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i - 8];
+ } else {
+ temp =
+ ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 16];
+ } else {
+ temp = ((u16)a.half[i - 16] >> imm) +
+ (((u16)a.half[i - 16] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i - 4];
+ } else {
+ temp =
+ ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 8];
+ } else {
+ temp =
+ ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+for (int i = 8; i < 16; i++) {
+ if (i < 12) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i - 4];
+ } else {
+ temp =
+ ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 8];
+ } else {
+ temp =
+ ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i - 2];
+ } else {
+ temp = ((u64)b.dword[i - 2] >> imm) +
+ (((u64)b.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 4];
+ } else {
+ temp = ((u64)a.dword[i - 4] >> imm) +
+ (((u64)a.dword[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+for (int i = 4; i < 8; i++) {
+ if (i < 6) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i - 2];
+ } else {
+ temp = ((u64)b.dword[i - 2] >> imm) +
+ (((u64)b.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 4];
+ } else {
+ temp = ((u64)a.dword[i - 4] >> imm) +
+ (((u64)a.dword[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i - 1];
+ } else {
+ temp = ((u128)b.qword[i - 1] >> imm) +
+ (((u128)b.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 2];
+ } else {
+ temp = ((u128)a.qword[i - 2] >> imm) +
+ (((u128)a.qword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.du.q xr, xr, imm
+CPU Flags: LASX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+for (int i = 2; i < 4; i++) {
+ if (i < 3) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i - 1];
+ } else {
+ temp = ((u128)b.qword[i - 1] >> imm) +
+ (((u128)b.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 2];
+ } else {
+ temp = ((u128)a.qword[i - 2] >> imm) +
+ (((u128)a.qword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m256i __lasx_xvrotr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.b xr, xr, xr
+CPU Flags: LASX
+
+Rotate right the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] =
+ (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvrotr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.h xr, xr, xr
+CPU Flags: LASX
+
+Rotate right the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+ (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvrotr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.w xr, xr, xr
+CPU Flags: LASX
+
+Rotate right the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+ (a.word[i] << (32 - (b.word[i] & 0x1f)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvrotr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.d xr, xr, xr
+CPU Flags: LASX
+
+Rotate right the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+ (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.b xr, xr, imm
+CPU Flags: LASX
+
+Rotate right the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.h xr, xr, imm
+CPU Flags: LASX
+
+Rotate right the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.w xr, xr, imm
+CPU Flags: LASX
+
+Rotate right the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.d xr, xr, imm
+CPU Flags: LASX
+
+Rotate right the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.b xr, xr, xr, xr
+CPU Flags: LASX
+
+Shuffle bytes from a
and b
with indices from c
.
Caveat: the indices are placed in c
, while in other vshuf
intrinsics, they are placed in a
.
for (int i = 0; i < 32; i++) {
+ if ((c.byte[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.byte[i] = 0;
+ } else if ((c.byte[i] % 32) < 16) {
+ dst.byte[i] = b.byte[(c.byte[i] % 32) + ((i >= 16) ? 16 : 0)];
+ } else {
+ dst.byte[i] = a.byte[(c.byte[i] % 32) + ((i >= 16) ? 0 : -16)];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.h xr, xr, xr
+CPU Flags: LASX
+
+Shuffle 16-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.half[i] = 0;
+ } else if ((a.half[i] % 16) < 8) {
+ dst.half[i] = c.half[(a.half[i] % 16) + ((i >= 8) ? 8 : 0)];
+ } else {
+ dst.half[i] = b.half[(a.half[i] % 16) + ((i >= 8) ? 0 : -8)];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.w xr, xr, xr
+CPU Flags: LASX
+
+Shuffle 32-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.word[i] = 0;
+ } else if ((a.word[i] % 8) < 4) {
+ dst.word[i] = c.word[(a.word[i] % 8) + ((i >= 4) ? 4 : 0)];
+ } else {
+ dst.word[i] = b.word[(a.word[i] % 8) + ((i >= 4) ? 0 : -4)];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.d xr, xr, xr
+CPU Flags: LASX
+
+Shuffle 64-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.dword[i] = 0;
+ } else if ((a.dword[i] % 4) < 2) {
+ dst.dword[i] = c.dword[(a.dword[i] % 4) + ((i >= 2) ? 2 : 0)];
+ } else {
+ dst.dword[i] = b.dword[(a.dword[i] % 4) + ((i >= 2) ? 0 : -2)];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.b xr, xr, imm
+CPU Flags: LASX
+
+Shuffle every four 8-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 32; i++) {
+ dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.h xr, xr, imm
+CPU Flags: LASX
+
+Shuffle every four 16-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.w xr, xr, imm
+CPU Flags: LASX
+
+Shuffle every four 32-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.d xr, xr, imm
+CPU Flags: LASX
+
+Shuffle every four 64-bit elements in a
and b
with indices packed in imm
, save the result to dst
.
dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+ (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+dst.dword[2] = (imm & 2) ? b.dword[(imm & 1) + 2] : a.dword[(imm & 1) + 2];
+dst.dword[3] =
+ (imm & 8) ? b.dword[((imm >> 2) & 1) + 2] : a.dword[((imm >> 2) & 1) + 2];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vbitsel.v vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise selection: for each bit position, if the bit in c
equals to one, copy the bit from b
to dst
, otherwise copy from a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vbitseli.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise selection: for each bit position, if the bit in a
equals to one, copy the bit from imm
to dst
, otherwise copy from b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vbitclr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.b vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitclr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.h vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitclr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.w vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitclr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.d vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.b vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.h vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.w vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.d vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitset_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.b vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitset_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.h vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitset_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.w vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitset_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.d vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.b vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.h vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.w vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.d vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitrev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.b vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitrev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.h vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitrev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.w vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitrev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.d vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.b vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.h vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.w vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.d vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vclo_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.b vr, vr
+CPU Flags: LSX
+
+Count leading ones of 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clo(a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vclo_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.h vr, vr
+CPU Flags: LSX
+
+Count leading ones of 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clo(a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vclo_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.w vr, vr
+CPU Flags: LSX
+
+Count leading ones of 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clo(a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vclo_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.d vr, vr
+CPU Flags: LSX
+
+Count leading ones of 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clo(a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vclz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.b vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clz(a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vclz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.h vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clz(a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vclz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.w vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clz(a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vclz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.d vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clz(a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vexth_h_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.h.b vr, vr
+CPU Flags: LSX
+
+Extend signed 8-bit elements in the higher half of a
to 16-bit.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vexth_hu_bu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.hu.bu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 8-bit elements in the higher half of a
to 16-bit.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vexth_w_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.w.h vr, vr
+CPU Flags: LSX
+
+Extend signed 16-bit elements in the higher half of a
to 32-bit.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vexth_wu_hu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.wu.hu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 16-bit elements in the higher half of a
to 32-bit.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vexth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.d.w vr, vr
+CPU Flags: LSX
+
+Extend signed 32-bit elements in the higher half of a
to 64-bit.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vexth_du_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.du.wu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 32-bit elements in the higher half of a
to 64-bit.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vexth_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.q.d vr, vr
+CPU Flags: LSX
+
+Extend signed 64-bit elements in the higher half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vexth_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.qu.du vr, vr
+CPU Flags: LSX
+
+Extend unsigned 64-bit elements in the higher half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vextl_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.q.d vr, vr
+CPU Flags: LSX
+
+Extend signed 64-bit elements in the lower half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vextl_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.qu.du vr, vr
+CPU Flags: LSX
+
+Extend unsigned 64-bit elements in the lower half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.b vr, vr, imm
+CPU Flags: LSX
+
+Extract one 8-bit element in b
and insert it to a
according to imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.h vr, vr, imm
+CPU Flags: LSX
+
+Extract one 16-bit element in b
and insert it to a
according to imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.w vr, vr, imm
+CPU Flags: LSX
+
+Extract one 32-bit element in b
and insert it to a
according to imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.d vr, vr, imm
+CPU Flags: LSX
+
+Extract one 64-bit element in b
and insert it to a
according to imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpcnt_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.b vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = popcount(a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vpcnt_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.h vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = popcount(a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vpcnt_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.w vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = popcount(a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vpcnt_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.d vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = popcount(a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
int __lsx_bz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vseteqz.v fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if the whole vector a
equals to zero.
dst = a.qword[0] == 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bnz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetnez.v fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if the whole vector a
is non-zero.
dst = a.qword[0] != 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.b fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 8-bit element in a
equals to zero.
dst = 0;
+for (int i = 0; i < 16; i++) {
+ if (a.byte[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.h fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 16-bit element in a
equals to zero.
dst = 0;
+for (int i = 0; i < 8; i++) {
+ if (a.half[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.w fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 32-bit element in a
equals to zero.
dst = 0;
+for (int i = 0; i < 4; i++) {
+ if (a.word[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.d fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 64-bit element in a
equals to zero.
dst = 0;
+for (int i = 0; i < 2; i++) {
+ if (a.dword[i] == 0) {
+ dst = 1;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bnz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.b fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 8-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 16; i++) {
+ if (a.byte[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bnz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.h fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 16-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 8; i++) {
+ if (a.half[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bnz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.w fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 32-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 4; i++) {
+ if (a.word[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
int __lsx_bnz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.d fcc, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 64-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 2; i++) {
+ if (a.dword[i] == 0) {
+ dst = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5 | +2 | +
__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.92 | +
__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.9 | +
__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.95 | +
__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.96 | +
__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128 __lsx_vfadd_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfadd.s vr, vr, vr
+CPU Flags: LSX
+
+Add single precision floating point elements in a
to elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m128d __lsx_vfadd_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add double precision floating point elements in a
to elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfdiv.s vr, vr, vr
+CPU Flags: LSX
+
+Divide single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18 | +
__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfdiv.d vr, vr, vr
+CPU Flags: LSX
+
+Divide double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 21.48 | +0.25 | +
__m128 __lsx_vfmax_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmax.s vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of single precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = +(a.fp32[i], b.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128d __lsx_vfmax_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmax.d vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of double precision floating point elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = +(a.fp64[i], b.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.s vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.d vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128 __lsx_vfmin_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmax.s vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of single precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = +(a.fp32[i], b.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128d __lsx_vfmin_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmax.d vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of double precision floating point elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = +(a.fp64[i], b.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128 __lsx_vfmina_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmina.s vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128d __lsx_vfmina_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmina.d vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128 __lsx_vfmul_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmul.s vr, vr, vr
+CPU Flags: LSX
+
+Multiply single precision floating point elements in a
and elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128d __lsx_vfmul_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmul.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply double precision floating point elements in a
and elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128 __lsx_vfsub_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfsub.s vr, vr, vr
+CPU Flags: LSX
+
+Subtract single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m128d __lsx_vfsub_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfsub.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +4 | +
__m128 __lsx_vflogb_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vflogb.s vr, vr
+CPU Flags: LSX
+
+Compute 2-based logarithm of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = log2(a.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128d __lsx_vflogb_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vflogb.d vr, vr
+CPU Flags: LSX
+
+Compute 2-based logarithm of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = log2(a.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128 __lsx_vfsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.s vr, vr
+CPU Flags: LSX
+
+Compute square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.08 | +
__m128d __lsx_vfsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.d vr, vr
+CPU Flags: LSX
+
+Compute square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +36 | +0.06 | +
__m128 __lsx_vfrsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.s vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17 | +0.05 | +
__m128d __lsx_vfrsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.d vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +15 | +0.04 | +
__m128 __lsx_vfrecip_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecip.s vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1 / a.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18 | +
__m128d __lsx_vfrecip_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecip.d vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1 / a.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25 | +
__m128 __lsx_vfrsqrte_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.s vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+__m128d __lsx_vfrsqrte_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.d vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+__m128 __lsx_vfrecipe_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.s vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+__m128d __lsx_vfrecipe_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.d vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+
+ __m128d __lsx_vfcvth_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvth.d.s vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in higher half of a
to double precision.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp32[2 + i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128d __lsx_vfcvtl_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.d.s vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in lower half of a
to double precision.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp32[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcvt.s.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double precision floating point elements in a
and b
to double precision.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ dst.fp32[i] = b.fp64[i];
+ } else {
+ dst.fp32[i] = a.fp64[i - 2];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128 __lsx_vfcvth_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvth.s.h vr, vr
+CPU Flags: LSX
+
+Convert half precision floating point elements in higher half of a
to single precision.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp16[4 + i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128 __lsx_vfcvtl_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.s.h vr, vr
+CPU Flags: LSX
+
+Convert half precision floating point elements in lower half of a
to single precision.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp16[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcvt.h.s vr, vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in a
and b
to half precision.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ dst.fp16[i] = b.fp32[i];
+ } else {
+ dst.fp16[i] = a.fp32[i - 4];
+ }
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128d __lsx_vffinth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffinth.d.w vr, vr
+CPU Flags: LSX
+
+Convert 32-bit integer elements in higher part of a
to double precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128d __lsx_vffintl_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffintl.d.w vr, vr
+CPU Flags: LSX
+
+Convert 32-bit integer elements in lower part of a
to double precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128d __lsx_vffint_d_l (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.l vr, vr
+CPU Flags: LSX
+
+Convert signed 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128d __lsx_vffint_d_lu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.lu vr, vr
+CPU Flags: LSX
+
+Convert unsigned 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128 __lsx_vffint_s_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.w vr, vr
+CPU Flags: LSX
+
+Convert signed 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128 __lsx_vffint_s_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.wu vr, vr
+CPU Flags: LSX
+
+Convert unsigned 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128 __lsx_vffint_s_l (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vffint.s.l vr, vr, vr
+CPU Flags: LSX
+
+Convert 64-bit integer elements in a
and b
to double-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] =
+ (i < 2) ? (f32)(s32)a.dword[i]
+ : (f32)(s32)b.dword[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftinth_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftinth.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrml_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrml.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrmh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrmh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrpl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrpl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrph_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrph.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrzl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrzh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrnel_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrnel.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrneh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrneh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftint_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftint_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrm_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrm.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrm_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrp_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrp.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrp_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrz_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrz_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrne_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrne.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrne_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftint_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.lu.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftint_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.wu.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrz_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.lu.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftintrz_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.wu.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +4 | +
__m128i __lsx_vftint_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftint.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, using current rounding mode specified in fscr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1)
+ ? (s64)a.fp64[i]
+ : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vfclass_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfclass.d vr, vr
+CPU Flags: LSX
+
+Classifiy each double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vfclass_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfclass.s vr, vr
+CPU Flags: LSX
+
+Classifiy each single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128 __lsx_vfrint_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrint.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, using current rounding mode specified in fscr
, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128d __lsx_vfrint_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrint.d vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, using current rounding mode specified in fscr
, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128 __lsx_vfrintrp_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128d __lsx_vfrintrp_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.d vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128 __lsx_vfrintrm_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128d __lsx_vfrintrm_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.d vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128 __lsx_vfrintrz_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128d __lsx_vfrintrz_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.d vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128 __lsx_vfrintrne_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128d __lsx_vfrintrne_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.d vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +5 | +2 | +
__m128i __lsx_vseq_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the 8-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vseq_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the 16-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vseq_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the 32-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vseq_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the 64-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the 8-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the 16-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the 32-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the 64-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslt_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vslt_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.bu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vslt_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vslt_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.hu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vslt_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vslt_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.wu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vslt_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vslt_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.du vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.bu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.hu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.wu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.du vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m128i __lsx_vsle_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsle_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.bu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsle_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsle_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.hu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsle_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsle_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.wu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsle_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vsle_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.du vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.bu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.hu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.wu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.du vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m128i __lsx_vadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.b vr, vr, vr
+CPU Flags: LSX
+
+Add 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.h vr, vr, vr
+CPU Flags: LSX
+
+Add 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + b.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.w vr, vr, vr
+CPU Flags: LSX
+
+Add 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + b.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vadd_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.q vr, vr, vr
+CPU Flags: LSX
+
+Add 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] + b.qword[0];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vabsd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.b vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vabsd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vabsd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.h vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vabsd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vabsd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.w vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vabsd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vabsd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vabsd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.du vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vadda_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.b vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vadda_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.h vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vadda_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.w vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vadda_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.d vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.bu vr, vr, imm
+CPU Flags: LSX
+
+Add 8-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.hu vr, vr, imm
+CPU Flags: LSX
+
+Add 16-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.wu vr, vr, imm
+CPU Flags: LSX
+
+Add 32-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.du vr, vr, imm
+CPU Flags: LSX
+
+Add 64-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vavg_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavg_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavg_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavg_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavg_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavg_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavg_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vavg_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vavgr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavgr_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavgr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavgr_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavgr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +3.91 | +
__m128i __lsx_vavgr_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vavgr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vavgr_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vdiv_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.b vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 32 | +0.06 | +
__m128i __lsx_vdiv_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.bu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 33 | +0.06 | +
__m128i __lsx_vdiv_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.h vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17 | +0.12 | +
__m128i __lsx_vdiv_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.hu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 22 | +0.11 | +
__m128i __lsx_vdiv_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.w vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18 | +
__m128i __lsx_vdiv_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.wu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11 | +0.18 | +
__m128i __lsx_vdiv_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.d vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25 | +
__m128i __lsx_vdiv_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.du vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8 | +0.25 | +
__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 8-bit elements in a
to even-positioned signed 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
to even-positioned unsigned 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 16-bit elements in a
to even-positioned signed 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
to even-positioned unsigned 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 32-bit elements in a
to even-positioned signed 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
to even-positioned unsigned 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 64-bit elements in a
to even-positioned signed 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
to even-positioned unsigned 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 8-bit elements in a
by even-positioned signed 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 8-bit elements in a
by even-positioned unsigned 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 16-bit elements in a
by even-positioned signed 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 16-bit elements in a
by even-positioned unsigned 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 32-bit elements in a
by even-positioned signed 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 32-bit elements in a
by even-positioned unsigned 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 64-bit elements in a
by even-positioned signed 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 64-bit elements in a
by even-positioned unsigned 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+ (u32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+ (u64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+ (u128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +1.14 | +
__m128i __lsx_vmax_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.b vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmax_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmax_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.h vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmax_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmax_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.w vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmax_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmax_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.d vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.82 | +
__m128i __lsx_vmax_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.du vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +3.95 | +
__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.bu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.h vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.hu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.w vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.wu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.d vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.du vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m128i __lsx_vmin_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.b vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmin_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmin_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.h vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +3.96 | +
__m128i __lsx_vmin_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmin_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.w vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmin_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmin_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.d vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vmin_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.du vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +4 | +
__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.b vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.bu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.h vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.hu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.w vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.wu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.d vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.du vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 2 | +4 | +
__m128i __lsx_vmod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.b vr, vr, vr
+CPU Flags: LSX
+
+Modulo residual signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 35 | +0.06 | +
__m128i __lsx_vmod_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.bu vr, vr, vr
+CPU Flags: LSX
+
+Modulo residual unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +29, 37 | +0.06 | +
__m128i __lsx_vmod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.h vr, vr, vr
+CPU Flags: LSX
+
+Modulo residual signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 21 | +0.12 | +
__m128i __lsx_vmod_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.hu vr, vr, vr
+CPU Flags: LSX
+
+Modulo residual unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +17, 21 | +0.11 | +
__m128i __lsx_vmod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.w vr, vr, vr
+CPU Flags: LSX
+
+Modulo residual signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11, 13 | +0.18 | +
__m128i __lsx_vmod_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.wu vr, vr, vr
+CPU Flags: LSX
+
+Modulo residual unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +11, 13 | +0.18 | +
__m128i __lsx_vmod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.d vr, vr, vr
+CPU Flags: LSX
+
+Modulo residual signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 10 | +0.25 | +
__m128i __lsx_vmod_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.du vr, vr, vr
+CPU Flags: LSX
+
+Modulo residual unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +8, 10 | +0.25 | +
__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmuh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmuh_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmuh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmuh_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmuh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmuh_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmuh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmuh_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmul_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmul_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] * b.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmul_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] * b.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmul_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +7 | +2 | +
__m128i __lsx_vneg_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.b vr, vr
+CPU Flags: LSX
+
+Negate 8-bit elements in a
and save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = -a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vneg_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.h vr, vr
+CPU Flags: LSX
+
+Negate 16-bit elements in a
and save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = -a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vneg_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.w vr, vr
+CPU Flags: LSX
+
+Negate 32-bit elements in a
and save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = -a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vneg_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.d vr, vr
+CPU Flags: LSX
+
+Negate 64-bit elements in a
and save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = -a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.b vr, vr, vr
+CPU Flags: LSX
+
+Saturing add the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsadd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.bu vr, vr, vr
+CPU Flags: LSX
+
+Saturing add the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.h vr, vr, vr
+CPU Flags: LSX
+
+Saturing add the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsadd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.hu vr, vr, vr
+CPU Flags: LSX
+
+Saturing add the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.w vr, vr, vr
+CPU Flags: LSX
+
+Saturing add the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsadd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.wu vr, vr, vr
+CPU Flags: LSX
+
+Saturing add the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.d vr, vr, vr
+CPU Flags: LSX
+
+Saturing add the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsadd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.du vr, vr, vr
+CPU Flags: LSX
+
+Saturing add the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vssub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.b vr, vr, vr
+CPU Flags: LSX
+
+Saturing subtract the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vssub_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.bu vr, vr, vr
+CPU Flags: LSX
+
+Saturing subtract the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vssub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.h vr, vr, vr
+CPU Flags: LSX
+
+Saturing subtract the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vssub_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.hu vr, vr, vr
+CPU Flags: LSX
+
+Saturing subtract the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vssub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.w vr, vr, vr
+CPU Flags: LSX
+
+Saturing subtract the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vssub_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.wu vr, vr, vr
+CPU Flags: LSX
+
+Saturing subtract the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vssub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.d vr, vr, vr
+CPU Flags: LSX
+
+Saturing subtract the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vssub_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.du vr, vr, vr
+CPU Flags: LSX
+
+Saturing subtract the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +3.66 | +
__m128i __lsx_vsub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] - b.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] - b.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsub_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.q vr, vr, vr
+CPU Flags: LSX
+
+Subtract 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] - b.qword[0];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.bu vr, vr, imm
+CPU Flags: LSX
+
+Subtract 8-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] - imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.hu vr, vr, imm
+CPU Flags: LSX
+
+Subtract 16-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] - imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.wu vr, vr, imm
+CPU Flags: LSX
+
+Subtract 32-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] - imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.du vr, vr, imm
+CPU Flags: LSX
+
+Subtract 64-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] - imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vand_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vand.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vandi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vandn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vandn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise ANDN between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vnor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vnor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise NOR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vnori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise NOR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise OR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise OR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vorn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vorn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise ORN between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vxor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vxor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise XOR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vxori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise XOR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vld vr, r, imm
+CPU Flags: LSX
+
+Read whole vector from memory address addr + offset
, save the data into dst
.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldx (void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vldx vr, r, r
+CPU Flags: LSX
+
+Read whole vector from memory address addr + offset
, save the data into dst
.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.b vr, r, imm
+CPU Flags: LSX
+
+Read 8-bit data from memory address addr + (offset << 0)
, replicate the data to all vector lanes and save into dst
.
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = data;
+}
+
+__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.h vr, r, imm
+CPU Flags: LSX
+
+Read 16-bit data from memory address addr + (offset << 1)
, replicate the data to all vector lanes and save into dst
.
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = data;
+}
+
+__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.w vr, r, imm
+CPU Flags: LSX
+
+Read 32-bit data from memory address addr + (offset << 2)
, replicate the data to all vector lanes and save into dst
.
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = data;
+}
+
+__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.d vr, r, imm
+CPU Flags: LSX
+
+Read 64-bit data from memory address addr + (offset << 3)
, replicate the data to all vector lanes and save into dst
.
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = data;
+}
+
+void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vst vr, r, imm
+CPU Flags: LSX
+
+Write whole vector data in data
to memory address addr + offset
.
memory_store(128, data, addr + offset);
+
+void __lsx_vstx (__m128i data, void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vstx vr, r, r
+CPU Flags: LSX
+
+Write whole-vector data in data
to memory address addr + offset
.
memory_store(128, data, addr + offset);
+
+void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.b vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 8-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(8, data.byte[lane], addr + offset);
+
+void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.h vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 16-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(16, data.half[lane], addr + offset);
+
+void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.w vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 32-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(32, data.word[lane], addr + offset);
+
+void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.d vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 64-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(64, data.dword[lane], addr + offset);
+
+
+ __m128i __lsx_vilvh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.b vr, vr, vr
+CPU Flags: LSX
+
+Interleave 8-bit elements in higher half of a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vilvh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.h vr, vr, vr
+CPU Flags: LSX
+
+Interleave 16-bit elements in higher half of a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vilvh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.w vr, vr, vr
+CPU Flags: LSX
+
+Interleave 32-bit elements in higher half of a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vilvh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.d vr, vr, vr
+CPU Flags: LSX
+
+Interleave 64-bit elements in higher half of a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vilvl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.b vr, vr, vr
+CPU Flags: LSX
+
+Interleave 8-bit elements in lower half of a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vilvl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.h vr, vr, vr
+CPU Flags: LSX
+
+Interleave 16-bit elements in lower half of a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vilvl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.w vr, vr, vr
+CPU Flags: LSX
+
+Interleave 32-bit elements in lower half of a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vilvl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.d vr, vr, vr
+CPU Flags: LSX
+
+Interleave 64-bit elements in lower half of a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.b vr, r, imm
+CPU Flags: LSX
+
+Insert 8-bit element into lane indexed imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i == imm) ? b : a.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.h vr, r, imm
+CPU Flags: LSX
+
+Insert 16-bit element into lane indexed imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i == imm) ? b : a.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.w vr, r, imm
+CPU Flags: LSX
+
+Insert 32-bit element into lane indexed imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.d vr, r, imm
+CPU Flags: LSX
+
+Insert 64-bit element into lane indexed imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.b vr, vr, vr
+CPU Flags: LSX
+
+Find the first negative 8-bit element in b
, set the index of the element to the lane of a
specified by c
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[c.byte[0] % 16] = i;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.h vr, vr, vr
+CPU Flags: LSX
+
+Find the first negative 16-bit element in b
, set the index of the element to the lane of a
specified by c
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[c.half[0] % 8] = i;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.b vr, vr, imm
+CPU Flags: LSX
+
+Find the first negative 8-bit element in b
, set the index of the element to the lane of a
specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[imm % 16] = i;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.h vr, vr, imm
+CPU Flags: LSX
+
+Find the first negative 16-bit element in b
, set the index of the element to the lane of a
specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[imm % 8] = i;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vmskgez_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskgez.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is greater than or equal to zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmskltz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmskltz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.h vr, vr
+CPU Flags: LSX
+
+For each 16-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmskltz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.w vr, vr
+CPU Flags: LSX
+
+For each 32-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmskltz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.d vr, vr
+CPU Flags: LSX
+
+For each 64-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vmsknz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmsknz.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is non-zero, set one bit in dst
, otherwise clear it.
u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpackev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.b vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 8-bit elements in a
and b
and store dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpackev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.h vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 16-bit elements in a
and b
and store dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpackev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.w vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 32-bit elements in a
and b
and store dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpackev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.d vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 64-bit elements in a
and b
and store dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpackod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.b vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 8-bit elements in a
and b
and store dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpackod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.h vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 16-bit elements in a
and b
and store dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpackod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.w vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 32-bit elements in a
and b
and store dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpackod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.d vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 64-bit elements in a
and b
and store dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpickev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.b vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 8-bit elements in b
first, then pick even-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpickev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.h vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 16-bit elements in b
first, then pick even-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpickev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.w vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 32-bit elements in b
first, then pick even-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpickev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.d vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 64-bit elements in b
first, then pick even-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.b r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s8)a.byte[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.bu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u8)a.byte[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.h r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s16)a.half[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.hu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u16)a.half[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.w r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s32)a.word[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.wu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u32)a.word[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.d r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (s64)a.dword[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.du r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = (u64)a.dword[idx];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vpickod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.b vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 8-bit elements in b
first, then pick odd-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpickod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.h vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 16-bit elements in b
first, then pick odd-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpickod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.w vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 32-bit elements in b
first, then pick odd-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vpickod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.d vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 64-bit elements in b
first, then pick odd-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vrepli_b (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_h (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_w (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_d (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vreplgr2vr_b (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.b vr, r
+CPU Flags: LSX
+
+Repeat val
to whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = val;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vreplgr2vr_h (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.h vr, r
+CPU Flags: LSX
+
+Repeat val
to whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = val;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vreplgr2vr_w (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.w vr, r
+CPU Flags: LSX
+
+Repeat val
to whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = val;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vreplgr2vr_d (long int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.d vr, r
+CPU Flags: LSX
+
+Repeat val
to whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = val;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vreplve_b (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.b vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx % 16];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vreplve_h (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.h vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx % 8];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vreplve_w (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.w vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx % 4];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vreplve_d (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.d vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx % 2];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +1 | +
__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.b vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.h vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.w vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.d vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.b vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.bu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.h vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.hu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.w vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.wu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.d vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.du vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsigncov_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.b vr, vr, vr
+CPU Flags: LSX
+
+If the 8-bit element in a
equals to zero, set the result to zero. If the signed 8-bit element in a
is posiive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vsigncov_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.h vr, vr, vr
+CPU Flags: LSX
+
+If the 16-bit element in a
equals to zero, set the result to zero. If the signed 16-bit element in a
is posiive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vsigncov_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.w vr, vr, vr
+CPU Flags: LSX
+
+If the 32-bit element in a
equals to zero, set the result to zero. If the signed 32-bit element in a
is posiive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vsigncov_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.d vr, vr, vr
+CPU Flags: LSX
+
+If the 64-bit element in a
equals to zero, set the result to zero. If the signed 64-bit element in a
is posiive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vldi (imm_n1024_1023 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Initialize dst
using predefined patterns:
imm[12:10]=0b000
: broadcast imm[7:0]
as 8-bit elements to all lanesimm[12:10]=0b001
: broadcast sign-extended imm[9:0]
as 16-bit elements to all lanesimm[12:10]=0b010
: broadcast sign-extended imm[9:0]
as 32-bit elements to all lanesimm[12:10]=0b011
: broadcast sign-extended imm[9:0]
as 64-bit elements to all lanesimm[12:8]=0b10000
: broadcast imm[7:0]
as 32-bit elements to all lanesimm[12:8]=0b10001
: broadcast imm[7:0] << 8
as 32-bit elements to all lanesimm[12:8]=0b10010
: broadcast imm[7:0] << 16
as 32-bit elements to all lanesimm[12:8]=0b10011
: broadcast imm[7:0] << 24
as 32-bit elements to all lanesimm[12:8]=0b10100
: broadcast imm[7:0]
as 16-bit elements to all lanesimm[12:8]=0b10101
: broadcast imm[7:0] << 8
as 16-bit elements to all lanesimm[12:8]=0b10110
: broadcast (imm[7:0] << 8) | 0xFF
as 32-bit elements to all lanesimm[12:8]=0b10111
: broadcast (imm[7:0] << 16) | 0xFFFF
as 32-bit elements to all lanesimm[12:8]=0b11000
: broadcast imm[7:0]
as 8-bit elements to all lanesimm[12:8]=0b11001
: repeat each bit of imm[7:0]
eight times, and broadcast the result as 64-bit elements to all lanesimm[12:8]=0b11010
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 32-bit elements to all lanesimm[12:8]=0b11011
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 64-bit elements to all lanesimm[12:8]=0b11100
: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)
as 64-bit elements to all lanesu64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+ broadcast_value = simm9_0;
+ broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+ broadcast_value = simm9_0;
+ broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+ broadcast_value = simm9_0;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+ broadcast_value = imm7_0 << 16;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+ broadcast_value = imm7_0 << 24;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+ broadcast_value = imm7_0;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+ broadcast_value = (imm7_0 << 8) | 0xFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+ broadcast_value = (imm7_0 << 16) | 0xFFFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+ broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+ imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+ imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+ imm7 * 0xFF00000000000000;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+ broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+ (imm5_0 << 48);
+ broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+ for (int i = 0; i < 16; i++) {
+ dst.byte[i] = broadcast_value;
+ }
+} else if (broadcast_width == 16) {
+ for (int i = 0; i < 8; i++) {
+ dst.half[i] = broadcast_value;
+ }
+} else if (broadcast_width == 32) {
+ for (int i = 0; i < 4; i++) {
+ dst.word[i] = broadcast_value;
+ }
+} else if (broadcast_width == 64) {
+ for (int i = 0; i < 2; i++) {
+ dst.dword[i] = broadcast_value;
+ }
+}
+
+Tested on real machine.
+ +__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vpermi.w vr, vr, imm
+CPU Flags: LSX
+
+Permute words from a
and b
with indices recorded in imm
and store into dst
.
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsll.v vr, vr, imm
+CPU Flags: LSX
+
+Compute whole vector a
shifted left by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsrl.v vr, vr, imm
+CPU Flags: LSX
+
+Compute whole vector a
shifted right by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsll_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.b vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsll_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.h vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsll_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.w vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsll_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.d vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +3.59 | +
__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vslli.b vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vslli.h vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslli.w vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vslli.d vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.h.b vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift signed 8-bit elements in a
by imm
to signed 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.hu.bu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 8-bit elements in a
by imm
to unsigned 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.w.h vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift signed 16-bit elements in a
by imm
to signed 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.wu.hu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 16-bit elements in a
by imm
to unsigned 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.d.w vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift signed 32-bit elements in a
by imm
to signed 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.du.wu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 32-bit elements in a
by imm
to unsigned 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 2 | +2 | +
__m128i __lsx_vsra_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.b vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsra_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsra_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsra_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.b vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+ : (s32)((s64)a.dword[i - 2] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+ : (s64)((s128)a.qword[i - 1] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrar_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.b vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+ (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrar_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+ (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrar_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+ (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrar_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.b vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] =
+ ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] =
+ ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] =
+ ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+ (((s16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (s16)(s32)a.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+ (((s32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+ (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+ (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)b.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+ (((s32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+ (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)b.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+ (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)b.qword[i];
+ } else {
+ dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+ (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.b vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsrl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsrl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsrl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.b vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] >> imm;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +2 | +2 | +
__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+ : (u32)((u64)a.dword[i - 2] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+ : (u64)((u128)a.qword[i - 1] >> imm);
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrlr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.b vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+ ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrlr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+ ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrlr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+ ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrlr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+ ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.b vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.5, 3 | +2 | +
__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+ (((u16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (u16)(u32)a.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+ (((u32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+ (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+ (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)b.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+ (((u32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+ (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)b.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+ (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)b.qword[i];
+ } else {
+ dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+ (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vssran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift the signed 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift the signed 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift the signed 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift the signed 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift the signed 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift the signed 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.du.q vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift the signed 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Arithemtic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +4 | +2 | +
__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +3 | +2 | +
__m128i __lsx_vrotr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.b vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vrotr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.h vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+ (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vrotr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.w vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+ (a.word[i] << (32 - (b.word[i] & 0x1f)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vrotr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.d vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+ (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +
__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.b vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.h vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.w vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.d vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.b vr, vr, vr, vr
+CPU Flags: LSX
+
+Shuffle bytes from a
and b
with indices from c
.
Caveat: the indices are placed in c
, while in other vshuf
intrinsics, they are placed in a
.
for (int i = 0; i < 16; i++) {
+ if (c.byte[i] >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.byte[i] = 0;
+ } else if ((c.byte[i] % 32) < 16) {
+ dst.byte[i] = b.byte[c.byte[i] % 16];
+ } else {
+ dst.byte[i] = a.byte[c.byte[i] % 16];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.h vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 16-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.half[i] = 0;
+ } else if ((a.half[i] % 16) < 8) {
+ dst.half[i] = c.half[a.half[i] % 8];
+ } else {
+ dst.half[i] = b.half[a.half[i] % 8];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.w vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 32-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.word[i] = 0;
+ } else if ((a.word[i] % 8) < 4) {
+ dst.word[i] = c.word[a.word[i] % 4];
+ } else {
+ dst.word[i] = b.word[a.word[i] % 4];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.d vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 64-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.dword[i] = 0;
+ } else if ((a.dword[i] % 4) < 2) {
+ dst.dword[i] = c.dword[a.dword[i] % 2];
+ } else {
+ dst.dword[i] = b.dword[a.dword[i] % 2];
+ }
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +2 | +
__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.b vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 8-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.h vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 16-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.w vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 32-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +0.25, 1 | +4 | +
__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.d vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 64-bit elements in a
and b
with indices packed in imm
, save the result to dst
.
dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+ (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+
+Tested on real machine.
+Architecture | +Latency | +Throughput (CPI) | +
---|---|---|
3A6000 | +1 | +4 | +