404
+ +Page not found
+ + +diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..baedcead --- /dev/null +++ b/404.html @@ -0,0 +1,139 @@ + + +
+ + + + +Page not found
+ + +This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et al. The documentation is compiled from the following sources:
+__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vbitsel.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise selection: for each bit position, if the bit in c
equals one, copy the bit from b
to dst
, otherwise copy from a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vbitseli.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise selection: for each bit position, if the bit in a
equals one, copy the bit from imm
to dst
, otherwise copy from b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.b vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.h vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.w vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.d vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.b vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.h vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.w vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.d vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitset_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.b vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitset_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.h vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitset_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.w vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitset_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.d vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.b vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.h vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.w vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.d vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.b vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.h vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.w vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.d vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.b vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.h vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.w vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.d vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vclo_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.b vr, vr
+CPU Flags: LSX
+
+Count leading ones of 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clo(a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vclo_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.h vr, vr
+CPU Flags: LSX
+
+Count leading ones of 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clo(a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vclo_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.w vr, vr
+CPU Flags: LSX
+
+Count leading ones of 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clo(a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vclo_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.d vr, vr
+CPU Flags: LSX
+
+Count leading ones of 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clo(a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vclz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.b vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clz(a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vclz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.h vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clz(a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vclz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.w vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clz(a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vclz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.d vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clz(a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_h_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.h.b vr, vr
+CPU Flags: LSX
+
+Extend signed 8-bit elements in the higher half of a
to 16-bit.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_hu_bu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.hu.bu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 8-bit elements in the higher half of a
to 16-bit.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_w_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.w.h vr, vr
+CPU Flags: LSX
+
+Extend signed 16-bit elements in the higher half of a
to 32-bit.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_wu_hu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.wu.hu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 16-bit elements in the higher half of a
to 32-bit.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.d.w vr, vr
+CPU Flags: LSX
+
+Extend signed 32-bit elements in the higher half of a
to 64-bit.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_du_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.du.wu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 32-bit elements in the higher half of a
to 64-bit.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.q.d vr, vr
+CPU Flags: LSX
+
+Extend signed 64-bit elements in the higher half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.qu.du vr, vr
+CPU Flags: LSX
+
+Extend unsigned 64-bit elements in the higher half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextl_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.q.d vr, vr
+CPU Flags: LSX
+
+Extend signed 64-bit elements in the lower half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextl_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.qu.du vr, vr
+CPU Flags: LSX
+
+Extend unsigned 64-bit elements in the lower half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.b vr, vr, imm
+CPU Flags: LSX
+
+Extract one 8-bit element in b
and insert it into a
according to imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.h vr, vr, imm
+CPU Flags: LSX
+
+Extract one 16-bit element in b
and insert it into a
according to imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.w vr, vr, imm
+CPU Flags: LSX
+
+Extract one 32-bit element in b
and insert it into a
according to imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.d vr, vr, imm
+CPU Flags: LSX
+
+Extract one 64-bit element in b
and insert it into a
according to imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vpcnt_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.b vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = popcount(a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vpcnt_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.h vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = popcount(a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vpcnt_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.w vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = popcount(a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vpcnt_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.d vr, vr
+CPU Flags: LSX
+
+Count the number of ones (population, popcount) in 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = popcount(a.dword[i]);
+}
+
+Tested on real machine.
+ +int __lsx_bz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vseteqz.v vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if the 128-bit vector a
equals zero.
dst = a.qword[0] == 0;
+
+int __lsx_bnz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetnez.v vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if the 128-bit vector a
is non-zero.
dst = a.qword[0] != 0;
+
+int __lsx_bz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.b vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 8-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 16; i++) {
+ if (a.byte[i] == 0) {
+ dst = 1;
+ }
+}
+
+int __lsx_bz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.h vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 16-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 8; i++) {
+ if (a.half[i] == 0) {
+ dst = 1;
+ }
+}
+
+int __lsx_bz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.w vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 32-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 4; i++) {
+ if (a.word[i] == 0) {
+ dst = 1;
+ }
+}
+
+int __lsx_bz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.d vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if any 64-bit element in a
equals zero.
dst = 0;
+for (int i = 0; i < 2; i++) {
+ if (a.dword[i] == 0) {
+ dst = 1;
+ }
+}
+
+int __lsx_bnz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.b vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 8-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 16; i++) {
+ if (a.byte[i] == 0) {
+ dst = 0;
+ }
+}
+
+int __lsx_bnz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.h vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 16-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 8; i++) {
+ if (a.half[i] == 0) {
+ dst = 0;
+ }
+}
+
+int __lsx_bnz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.w vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 32-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 4; i++) {
+ if (a.word[i] == 0) {
+ dst = 0;
+ }
+}
+
+int __lsx_bnz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.d vr, vr; bcnez
+CPU Flags: LSX
+
+Expected to be used in branches: branch if all 64-bit elements in a
are non-zero.
dst = 1;
+for (int i = 0; i < 2; i++) {
+ if (a.dword[i] == 0) {
+ dst = 0;
+ }
+}
+
+
+ __m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 4; i++) {
+ if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+
+__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0; i < 2; i++) {
+ if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+ dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.dword[i] = 0;
+ }
+
+
+ __m128 __lsx_vfadd_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfadd.s vr, vr, vr
+CPU Flags: LSX
+
+Add single precision floating point elements in a
to elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+Tested on real machine.
+__m128d __lsx_vfadd_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add double precision floating point elements in a
to elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+Tested on real machine.
+__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfdiv.s vr, vr, vr
+CPU Flags: LSX
+
+Divide single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfdiv.d vr, vr, vr
+CPU Flags: LSX
+
+Divide double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+__m128 __lsx_vfmax_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmax.s vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of single precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+
+__m128d __lsx_vfmax_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmax.d vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of double precision floating point elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+
+__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.s vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.d vr, vr, vr
+CPU Flags: LSX
+
+Compute maximum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+__m128 __lsx_vfmin_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmin.s vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of single precision floating point elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+
+__m128d __lsx_vfmin_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmin.d vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of double precision floating point elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+
+__m128 __lsx_vfmina_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmina.s vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of single precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+__m128d __lsx_vfmina_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmina.d vr, vr, vr
+CPU Flags: LSX
+
+Compute minimum of double precision floating point elements in a
and b
by magnitude.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+__m128 __lsx_vfmul_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmul.s vr, vr, vr
+CPU Flags: LSX
+
+Multiply single precision floating point elements in a
and elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+__m128d __lsx_vfmul_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmul.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply double precision floating point elements in a
and elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+__m128 __lsx_vfsub_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfsub.s vr, vr, vr
+CPU Flags: LSX
+
+Subtract single precision floating point elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+Tested on real machine.
+__m128d __lsx_vfsub_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfsub.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract double precision floating point elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+Tested on real machine.
+__m128 __lsx_vflogb_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vflogb.s vr, vr
+CPU Flags: LSX
+
+Compute base-2 logarithm of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = log2(a.fp32[i]);
+}
+
+__m128d __lsx_vflogb_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vflogb.d vr, vr
+CPU Flags: LSX
+
+Compute base-2 logarithm of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = log2(a.fp64[i]);
+}
+
+__m128 __lsx_vfsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.s vr, vr
+CPU Flags: LSX
+
+Compute square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+__m128d __lsx_vfsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.d vr, vr
+CPU Flags: LSX
+
+Compute square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+__m128 __lsx_vfrsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.s vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+__m128d __lsx_vfrsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.d vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+__m128 __lsx_vfrecip_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecip.s vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1 / a.fp32[i];
+}
+
+__m128d __lsx_vfrecip_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecip.d vr, vr
+CPU Flags: LSX
+
+Compute reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1 / a.fp64[i];
+}
+
+__m128 __lsx_vfrsqrte_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.s vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of square root of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+__m128d __lsx_vfrsqrte_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.d vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of square root of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+__m128 __lsx_vfrecipe_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.s vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of single precision floating point elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+__m128d __lsx_vfrecipe_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.d vr, vr
+CPU Flags: LSX
+
+Compute estimated reciprocal of double precision floating point elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+
+ __m128d __lsx_vfcvth_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvth.d.s vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in higher half of a
to double precision.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp32[2 + i];
+}
+
+Tested on real machine.
+__m128d __lsx_vfcvtl_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.d.s vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in lower half of a
to double precision.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp32[i];
+}
+
+Tested on real machine.
+__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcvt.s.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double precision floating point elements in a
and b
to single precision.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ dst.fp32[i] = b.fp64[i];
+ } else {
+ dst.fp32[i] = a.fp64[i - 2];
+ }
+}
+
+Tested on real machine.
+__m128 __lsx_vfcvth_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvth.s.h vr, vr
+CPU Flags: LSX
+
+Convert half precision floating point elements in higher half of a
to single precision.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp16[4 + i];
+}
+
+__m128 __lsx_vfcvtl_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.s.h vr, vr
+CPU Flags: LSX
+
+Convert half precision floating point elements in lower half of a
to single precision.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp16[i];
+}
+
+__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcvt.h.s vr, vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in a
and b
to half precision.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ dst.fp16[i] = b.fp32[i];
+ } else {
+ dst.fp16[i] = a.fp32[i - 4];
+ }
+}
+
+__m128d __lsx_vffinth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffinth.d.w vr, vr
+CPU Flags: LSX
+
+Convert 32-bit integer elements in higher part of a
to double precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C
+}
+
+__m128d __lsx_vffintl_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffintl.d.w vr, vr
+CPU Flags: LSX
+
+Convert 32-bit integer elements in lower part of a
to double precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+__m128d __lsx_vffint_d_l (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.l vr, vr
+CPU Flags: LSX
+
+Convert signed 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+__m128d __lsx_vffint_d_lu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.lu vr, vr
+CPU Flags: LSX
+
+Convert unsigned 64-bit integer elements in a
to double-precision floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+__m128 __lsx_vffint_s_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.w vr, vr
+CPU Flags: LSX
+
+Convert signed 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+__m128 __lsx_vffint_s_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.wu vr, vr
+CPU Flags: LSX
+
+Convert unsigned 32-bit integer elements in a
to single-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+__m128 __lsx_vffint_s_l (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vffint.s.l vr, vr, vr
+CPU Flags: LSX
+
+Convert 64-bit integer elements in a
and b
to single-precision floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] =
+ (i < 2) ? (f32)(s64)b.dword[i]
+ : (f32)(s64)a.dword[i - 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+Tested on real machine.
+__m128i __lsx_vftinth_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftinth.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+Tested on real machine.
+__m128i __lsx_vftintrml_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrml.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrmh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrmh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrpl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrpl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrph_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrph.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrzl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzl.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrzh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrnel_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrnel.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in lower part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrneh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrneh.l.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in higher part of a
to 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftint_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftint_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrm_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrm.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards negative infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrm_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrp_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrp.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards positive infinity.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrp_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrz_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrz_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrne_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrne.l.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to signed 64-bit integer, rounding towards nearest even.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrne_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to signed 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftint_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.lu.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftint_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.wu.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrz_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.lu.d vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
to unsigned 64-bit integer, rounding towards zero.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrz_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.wu.s vr, vr
+CPU Flags: LSX
+
+Convert single-precision floating point elements in a
to unsigned 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftint_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftint.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, using current rounding mode specified in fcsr
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2)
+ ? (s32)b.fp64[i]
+ : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards negative infinity.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2)
+ ? (s32)b.fp64[i]
+ : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards positive infinity.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2)
+ ? (s32)b.fp64[i]
+ : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards zero.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2)
+ ? (s32)b.fp64[i]
+ : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double-precision floating point elements in a
and b
to 32-bit integer, rounding towards nearest even.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2)
+ ? (s32)b.fp64[i]
+ : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+
+ __m128i __lsx_vfclass_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfclass.d vr, vr
+CPU Flags: LSX
+
+Classify each double precision floating point element in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+__m128i __lsx_vfclass_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfclass.s vr, vr
+CPU Flags: LSX
+
+Classify each single precision floating point element in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+__m128 __lsx_vfrint_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrint.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, using current rounding mode specified in fcsr
, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128d __lsx_vfrint_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrint.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, using current rounding mode specified in fcsr
, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128 __lsx_vfrintrp_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128d __lsx_vfrintrp_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, rounding towards positive infinity, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128 __lsx_vfrintrm_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128d __lsx_vfrintrm_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, rounding towards negative infinity, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128 __lsx_vfrintrz_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128d __lsx_vfrintrz_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, rounding towards zero, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+__m128 __lsx_vfrintrne_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.s vr, vr
+CPU Flags: LSX
+
+Round single-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+__m128d __lsx_vfrintrne_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.d vr, vr
+CPU Flags: LSX
+
+Round double-precision floating point elements in a
to integers, rounding towards nearest even, and store as floating point numbers.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+
+ __m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmadd.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmsub.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+
+ __m128i __lsx_vseq_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the 8-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vseq_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the 16-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vseq_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the 32-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vseq_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the 64-bit elements in a
and b
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the 8-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the 16-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the 32-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the 64-bit elements in a
and imm
, store all-ones to dst
if equal, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslt_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslt_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.bu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslt_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslt_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.hu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslt_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslt_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.wu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslt_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslt_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.du vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.bu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.hu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.wu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.du vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than imm
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsle_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.b vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsle_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.bu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsle_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.h vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsle_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.hu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsle_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.w vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsle_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.wu vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsle_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsle_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.du vr, vr, vr
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and b
, store all-ones to dst
if corresponding element in a
is less than or equal to b
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.b vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.bu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 8-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.h vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.hu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 16-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.w vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.wu vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 32-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.d vr, vr, imm
+CPU Flags: LSX
+
+Compare the signed 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.du vr, vr, imm
+CPU Flags: LSX
+
+Compare the unsigned 64-bit elements in a
and imm
, store all-ones to dst
if corresponding element in a
is less than or equal to imm
, zero otherwise.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+Tested on real machine.
+ +__m128i __lsx_vadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.b vr, vr, vr
+CPU Flags: LSX
+
+Add 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.h vr, vr, vr
+CPU Flags: LSX
+
+Add 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + b.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.w vr, vr, vr
+CPU Flags: LSX
+
+Add 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + b.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.q vr, vr, vr
+CPU Flags: LSX
+
+Add 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] + b.qword[0];
+
+Tested on real machine.
+__m128i __lsx_vabsd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.b vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.h vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.w vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.du vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.b vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.h vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.w vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.d vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.bu vr, vr, imm
+CPU Flags: LSX
+
+Add 8-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.hu vr, vr, imm
+CPU Flags: LSX
+
+Add 16-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.wu vr, vr, imm
+CPU Flags: LSX
+
+Add 32-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.du vr, vr, imm
+CPU Flags: LSX
+
+Add 64-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] & b.byte[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] & b.half[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] & b.word[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] & b.dword[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vdiv_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.b vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vdiv_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.bu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vdiv_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.h vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vdiv_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.hu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vdiv_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.w vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vdiv_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.wu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vdiv_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.d vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vdiv_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.du vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 8-bit elements in a
to even-positioned signed 8-bit elements in b, save the 16-bit result in dst.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
to even-positioned unsigned 8-bit elements in b, save the 16-bit result in dst.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 16-bit elements in a
to even-positioned signed 16-bit elements in b, save the 32-bit result in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
to even-positioned unsigned 16-bit elements in b, save the 32-bit result in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 32-bit elements in a
to even-positioned signed 32-bit elements in b, save the 64-bit result in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
to even-positioned unsigned 32-bit elements in b, save the 64-bit result in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 64-bit elements in a
to even-positioned signed 64-bit elements in b, save the 128-bit result in dst.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
to even-positioned unsigned 64-bit elements in b, save the 128-bit result in dst.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 8-bit elements in b
from odd-positioned signed 8-bit elements in a, save the 16-bit result in dst.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 8-bit elements in b
from odd-positioned unsigned 8-bit elements in a, save the 16-bit result in dst.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 16-bit elements in b
from odd-positioned signed 16-bit elements in a, save the 32-bit result in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 16-bit elements in b
from odd-positioned unsigned 16-bit elements in a, save the 32-bit result in dst.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 32-bit elements in b
from odd-positioned signed 32-bit elements in a, save the 64-bit result in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 32-bit elements in b
from odd-positioned unsigned 32-bit elements in a, save the 64-bit result in dst.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 64-bit elements in b
from odd-positioned signed 64-bit elements in a, save the 128-bit result in dst.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 64-bit elements in b
from odd-positioned unsigned 64-bit elements in a, save the 128-bit result in dst.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+ (u32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+ (u64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+ (u128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.b vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.h vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.w vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.d vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.du vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.bu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.h vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.hu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.w vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.wu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.d vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.du vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.b vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.h vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.w vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.d vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.du vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.b vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.bu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.h vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.hu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.w vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.wu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.d vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.du vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder of dividing signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmod_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder of dividing unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder of dividing signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmod_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder of dividing unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder of dividing signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmod_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder of dividing unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder of dividing signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmod_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the remainder of dividing unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmuh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+__m128i __lsx_vmuh_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 8-bit elements in a
and b
, save the high 8-bit result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+Tested on real machine.
+__m128i __lsx_vmuh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+__m128i __lsx_vmuh_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 16-bit elements in a
and b
, save the high 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+Tested on real machine.
+__m128i __lsx_vmuh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+__m128i __lsx_vmuh_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 32-bit elements in a
and b
, save the high 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+Tested on real machine.
+__m128i __lsx_vmuh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply signed 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+__m128i __lsx_vmuh_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply unsigned 64-bit elements in a
and b
, save the high 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+Tested on real machine.
+__m128i __lsx_vmul_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmul_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] * b.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmul_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] * b.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmul_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vneg_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.b vr, vr
+CPU Flags: LSX
+
+Negate 8-bit elements in a
and save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = -a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vneg_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.h vr, vr
+CPU Flags: LSX
+
+Negate 16-bit elements in a
and save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = -a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vneg_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.w vr, vr
+CPU Flags: LSX
+
+Negate 32-bit elements in a
and save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = -a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vneg_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.d vr, vr
+CPU Flags: LSX
+
+Negate 64-bit elements in a
and save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = -a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.b vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsadd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.bu vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.h vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsadd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.hu vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.w vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsadd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.wu vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.d vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsadd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.du vr, vr, vr
+CPU Flags: LSX
+
+Saturating add the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vssub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.b vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the signed 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vssub_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.bu vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the unsigned 8-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vssub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.h vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the signed 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vssub_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.hu vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the unsigned 16-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vssub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.w vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the signed 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vssub_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.wu vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the unsigned 32-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vssub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.d vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the signed 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vssub_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.du vr, vr, vr
+CPU Flags: LSX
+
+Saturating subtract the unsigned 64-bit elements in a
and b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] - b.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] - b.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.q vr, vr, vr
+CPU Flags: LSX
+
+Subtract 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] - b.qword[0];
+
+Tested on real machine.
+__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.bu vr, vr, imm
+CPU Flags: LSX
+
+Subtract 8-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] - imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.hu vr, vr, imm
+CPU Flags: LSX
+
+Subtract 16-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] - imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.wu vr, vr, imm
+CPU Flags: LSX
+
+Subtract 32-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] - imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.du vr, vr, imm
+CPU Flags: LSX
+
+Subtract 64-bit elements in a
by imm
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] - imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+Tested on real machine.
+ +__m128i __lsx_vand_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vand.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vandi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vandn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vandn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise ANDN between elements in a
and b
: the bits of a are inverted first, then ANDed with b.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vnor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vnor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise NOR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vnori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise NOR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise OR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise OR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vorn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vorn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise ORN between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vxor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vxor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise XOR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vxori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise XOR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ imm;
+}
+
+Tested on real machine.
+ +__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vld vr, r, imm
+CPU Flags: LSX
+
+Read 128-bit data from memory address addr + offset
, save the data into dst
.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldx (void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vldx vr, r, r
+CPU Flags: LSX
+
+Read 128-bit data from memory address addr + offset
, save the data into dst
.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.b vr, r, imm
+CPU Flags: LSX
+
+Read 8-bit data from memory address addr + (offset << 0)
, replicate the data to all vector lanes and save into dst
.
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = data;
+}
+
+__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.h vr, r, imm
+CPU Flags: LSX
+
+Read 16-bit data from memory address addr + (offset << 1)
, replicate the data to all vector lanes and save into dst
.
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = data;
+}
+
+__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.w vr, r, imm
+CPU Flags: LSX
+
+Read 32-bit data from memory address addr + (offset << 2)
, replicate the data to all vector lanes and save into dst
.
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = data;
+}
+
+__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.d vr, r, imm
+CPU Flags: LSX
+
+Read 64-bit data from memory address addr + (offset << 3)
, replicate the data to all vector lanes and save into dst
.
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = data;
+}
+
+void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vst vr, r, imm
+CPU Flags: LSX
+
+Write 128-bit data in data
to memory address addr + offset
.
memory_store(128, data, addr + offset);
+
+void __lsx_vstx (__m128i data, void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vstx vr, r, r
+CPU Flags: LSX
+
+Write 128-bit data in data
to memory address addr + offset
.
memory_store(128, data, addr + offset);
+
+void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.b vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 8-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(8, data.byte[lane], addr + offset);
+
+void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.h vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 16-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(16, data.half[lane], addr + offset);
+
+void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.w vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 32-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(32, data.word[lane], addr + offset);
+
+void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.d vr, r, imm, imm
+CPU Flags: LSX
+
+Store the 64-bit element in data
specified by lane
to memory address addr + offset
.
memory_store(64, data.dword[lane], addr + offset);
+
+
+ __m128i __lsx_vilvh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.b vr, vr, vr
+CPU Flags: LSX
+
+Interleave 8-bit elements in higher half of a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.h vr, vr, vr
+CPU Flags: LSX
+
+Interleave 16-bit elements in higher half of a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.w vr, vr, vr
+CPU Flags: LSX
+
+Interleave 32-bit elements in higher half of a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.d vr, vr, vr
+CPU Flags: LSX
+
+Interleave 64-bit elements in higher half of a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.b vr, vr, vr
+CPU Flags: LSX
+
+Interleave 8-bit elements in lower half of a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.h vr, vr, vr
+CPU Flags: LSX
+
+Interleave 16-bit elements in lower half of a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.w vr, vr, vr
+CPU Flags: LSX
+
+Interleave 32-bit elements in lower half of a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.d vr, vr, vr
+CPU Flags: LSX
+
+Interleave 64-bit elements in lower half of a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.b vr, r, imm
+CPU Flags: LSX
+
+Insert 8-bit element into lane indexed imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i == imm) ? b : a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.h vr, r, imm
+CPU Flags: LSX
+
+Insert 16-bit element into lane indexed imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i == imm) ? b : a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.w vr, r, imm
+CPU Flags: LSX
+
+Insert 32-bit element into lane indexed imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.d vr, r, imm
+CPU Flags: LSX
+
+Insert 64-bit element into lane indexed imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.b vr, vr, vr
+CPU Flags: LSX
+
+Find the first negative 8-bit element in b
, set the index of the element to the lane of a
specified by c
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[c.byte[0] % 16] = i;
+
+Tested on real machine.
+__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.h vr, vr, vr
+CPU Flags: LSX
+
+Find the first negative 16-bit element in b
, set the index of the element to the lane of a
specified by c
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[c.half[0] % 8] = i;
+
+Tested on real machine.
+__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.b vr, vr, imm
+CPU Flags: LSX
+
+Find the first negative 8-bit element in b
, set the index of the element to the lane of a
specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+ if ((s8)b.byte[i] < 0) {
+ break;
+ }
+}
+dst.byte[imm % 16] = i;
+
+Tested on real machine.
+__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.h vr, vr, imm
+CPU Flags: LSX
+
+Find the first negative 16-bit element in b
, set the index of the element to the lane of a
specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+ if ((s16)b.half[i] < 0) {
+ break;
+ }
+}
+dst.half[imm % 8] = i;
+
+Tested on real machine.
+__m128i __lsx_vmskgez_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskgez.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is greater than or equal to zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+Tested on real machine.
+__m128i __lsx_vmskltz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+Tested on real machine.
+__m128i __lsx_vmskltz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.h vr, vr
+CPU Flags: LSX
+
+For each 16-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+Tested on real machine.
+__m128i __lsx_vmskltz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.w vr, vr
+CPU Flags: LSX
+
+For each 32-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+Tested on real machine.
+__m128i __lsx_vmskltz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.d vr, vr
+CPU Flags: LSX
+
+For each 64-bit element in a
, if the element is less than zero, set one bit in dst
, otherwise clear it.
u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+Tested on real machine.
+__m128i __lsx_vmsknz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmsknz.b vr, vr
+CPU Flags: LSX
+
+For each 8-bit element in a
, if the element is non-zero, set one bit in dst
, otherwise clear it.
u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+Tested on real machine.
+__m128i __lsx_vpackev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.b vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 8-bit elements in a
and b
and store the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vpackev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.h vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 16-bit elements in a
and b
and store the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vpackev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.w vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 32-bit elements in a
and b
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vpackev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.d vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack even-positioned 64-bit elements in a
and b
and store the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vpackod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.b vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 8-bit elements in a
and b
and store the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vpackod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.h vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 16-bit elements in a
and b
and store the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vpackod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.w vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 32-bit elements in a
and b
and store the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vpackod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.d vr, vr, vr
+CPU Flags: LSX
+
+Collect and pack odd-positioned 64-bit elements in a
and b
and store the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vpickev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.b vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 8-bit elements in b
first, then pick even-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vpickev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.h vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 16-bit elements in b
first, then pick even-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vpickev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.w vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 32-bit elements in b
first, then pick even-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vpickev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.d vr, vr, vr
+CPU Flags: LSX
+
+Pick even-positioned 64-bit elements in b
first, then pick even-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+
+Tested on real machine.
+int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.b r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = a.byte[idx];
+
+unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.bu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = a.byte[idx];
+
+int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.h r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = a.half[idx];
+
+unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.hu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = a.half[idx];
+
+int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.w r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = a.word[idx];
+
+unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.wu r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = a.word[idx];
+
+long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.d r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = a.dword[idx];
+
+unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.du r, vr, imm
+CPU Flags: LSX
+
+Pick the lane
specified by idx
from a
and store into dst
.
dst = a.dword[idx];
+
+__m128i __lsx_vpickod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.b vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 8-bit elements in b
first, then pick odd-positioned 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vpickod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.h vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 16-bit elements in b
first, then pick odd-positioned 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vpickod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.w vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 32-bit elements in b
first, then pick odd-positioned 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vpickod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.d vr, vr, vr
+CPU Flags: LSX
+
+Pick odd-positioned 64-bit elements in b
first, then pick odd-positioned 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_b (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi.b vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_h (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi.h vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_w (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi.w vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vrepli_d (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi.d vr, imm
+CPU Flags: LSX
+
+Repeat imm
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vreplgr2vr_b (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.b vr, r
+CPU Flags: LSX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = val;
+}
+
+Tested on real machine.
+__m128i __lsx_vreplgr2vr_h (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.h vr, r
+CPU Flags: LSX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = val;
+}
+
+Tested on real machine.
+__m128i __lsx_vreplgr2vr_w (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.w vr, r
+CPU Flags: LSX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = val;
+}
+
+Tested on real machine.
+__m128i __lsx_vreplgr2vr_d (long int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.d vr, r
+CPU Flags: LSX
+
+Repeat val
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = val;
+}
+
+Tested on real machine.
+__m128i __lsx_vreplve_b (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.b vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx % 16];
+}
+
+Tested on real machine.
+__m128i __lsx_vreplve_h (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.h vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx % 8];
+}
+
+Tested on real machine.
+__m128i __lsx_vreplve_w (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.w vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx % 4];
+}
+
+Tested on real machine.
+__m128i __lsx_vreplve_d (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.d vr, vr, r
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx % 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.b vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[idx];
+}
+
+Tested on real machine.
+__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.h vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[idx];
+}
+
+Tested on real machine.
+__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.w vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[idx];
+}
+
+Tested on real machine.
+__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.d vr, vr, imm
+CPU Flags: LSX
+
+Repeat the element in lane idx
of a
to fill whole vector.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[idx];
+}
+
+Tested on real machine.
+__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.b vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.bu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 8-bit elements in a
to range specified by imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.h vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.hu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 16-bit elements in a
to range specified by imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.w vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.wu vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 32-bit elements in a
to range specified by imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.d vr, vr, imm
+CPU Flags: LSX
+
+Clamp signed 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.du vr, vr, imm
+CPU Flags: LSX
+
+Clamp unsigned 64-bit elements in a
to range specified by imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vsigncov_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.b vr, vr, vr
+CPU Flags: LSX
+
+If the 8-bit element in a
equals zero, set the result to zero. If the signed 8-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsigncov_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.h vr, vr, vr
+CPU Flags: LSX
+
+If the 16-bit element in a
equals zero, set the result to zero. If the signed 16-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsigncov_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.w vr, vr, vr
+CPU Flags: LSX
+
+If the 32-bit element in a
equals zero, set the result to zero. If the signed 32-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vsigncov_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.d vr, vr, vr
+CPU Flags: LSX
+
+If the 64-bit element in a
equals zero, set the result to zero. If the signed 64-bit element in a
is positive, copy element in b
to result. Otherwise, copy negated element in b
to result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vldi (imm_n1024_1023 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+Initialize dst
using predefined patterns:
imm[12:10]=0b000
: broadcast imm[7:0]
as 8-bit elements to all lanes
imm[12:10]=0b001
: broadcast sign-extended imm[9:0]
as 16-bit elements to all lanes
imm[12:10]=0b010
: broadcast sign-extended imm[9:0]
as 32-bit elements to all lanes
imm[12:10]=0b011
: broadcast sign-extended imm[9:0]
as 64-bit elements to all lanes
imm[12:8]=0b10000
: broadcast imm[7:0]
as 32-bit elements to all lanes
imm[12:8]=0b10001
: broadcast imm[7:0] << 8
as 32-bit elements to all lanes
imm[12:8]=0b10010
: broadcast imm[7:0] << 16
as 32-bit elements to all lanes
imm[12:8]=0b10011
: broadcast imm[7:0] << 24
as 32-bit elements to all lanes
imm[12:8]=0b10100
: broadcast imm[7:0]
as 16-bit elements to all lanes
imm[12:8]=0b10101
: broadcast imm[7:0] << 8
as 16-bit elements to all lanes
imm[12:8]=0b10110
: broadcast (imm[7:0] << 8) | 0xFF
as 32-bit elements to all lanes
imm[12:8]=0b10111
: broadcast (imm[7:0] << 16) | 0xFFFF
as 32-bit elements to all lanes
imm[12:8]=0b11000
: broadcast imm[7:0]
as 8-bit elements to all lanes
imm[12:8]=0b11001
: repeat each bit of imm[7:0]
eight times, and broadcast the result as 64-bit elements to all lanes
imm[12:8]=0b11010
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 32-bit elements to all lanes
imm[12:8]=0b11011
: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)
as 64-bit elements to all lanes
imm[12:8]=0b11100
: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)
as 64-bit elements to all lanes
u64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+ broadcast_value = simm9_0;
+ broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+ broadcast_value = simm9_0;
+ broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+ broadcast_value = simm9_0;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+ broadcast_value = imm7_0 << 16;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+ broadcast_value = imm7_0 << 24;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+ broadcast_value = imm7_0;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+ broadcast_value = imm7_0 << 8;
+ broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+ broadcast_value = (imm7_0 << 8) | 0xFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+ broadcast_value = (imm7_0 << 16) | 0xFFFF;
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+ broadcast_value = imm7_0;
+ broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+ broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+ imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+ imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+ imm7 * 0xFF00000000000000;
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+ broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+ (imm5_0 << 19);
+ broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+ broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+ (imm5_0 << 48);
+ broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+ for (int i = 0; i < 16; i++) {
+ dst.byte[i] = broadcast_value;
+ }
+} else if (broadcast_width == 16) {
+ for (int i = 0; i < 8; i++) {
+ dst.half[i] = broadcast_value;
+ }
+} else if (broadcast_width == 32) {
+ for (int i = 0; i < 4; i++) {
+ dst.word[i] = broadcast_value;
+ }
+} else if (broadcast_width == 64) {
+ for (int i = 0; i < 2; i++) {
+ dst.dword[i] = broadcast_value;
+ }
+}
+
+Tested on real machine.
+ +__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vpermi.w vr, vr, imm
+CPU Flags: LSX
+
+Permute words from a
and b
with indices recorded in imm
and store into dst
.
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+
+Tested on real machine.
+ +__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsll.v vr, vr, imm
+CPU Flags: LSX
+
+Compute 128-bit a
shifted left by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+
+Tested on real machine.
+__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsrl.v vr, vr, imm
+CPU Flags: LSX
+
+Compute 128-bit a
shifted right by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+
+Tested on real machine.
+__m128i __lsx_vsll_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.b vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+__m128i __lsx_vsll_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.h vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+__m128i __lsx_vsll_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.w vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+__m128i __lsx_vsll_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.d vr, vr, vr
+CPU Flags: LSX
+
+Logical left shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vslli.b vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vslli.h vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslli.w vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vslli.d vr, vr, imm
+CPU Flags: LSX
+
+Logical left shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.h.b vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift signed 8-bit elements in a
by imm
to signed 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.hu.bu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 8-bit elements in a
by imm
to unsigned 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.w.h vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift signed 16-bit elements in a
by imm
to signed 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.wu.hu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 16-bit elements in a
by imm
to unsigned 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.d.w vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift signed 32-bit elements in a
by imm
to signed 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.du.wu vr, vr, imm
+CPU Flags: LSX
+
+Extend and shift unsigned 32-bit elements in a
by imm
to unsigned 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsra_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.b vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+__m128i __lsx_vsra_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+__m128i __lsx_vsra_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+__m128i __lsx_vsra_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.b vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+ : (s32)((s64)a.dword[i - 2] >> imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+ : (s64)((s128)a.qword[i - 1] >> imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrar_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.b vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+ (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrar_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+ (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrar_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+ (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrar_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.b vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] =
+ ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] =
+ ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] =
+ ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+ (((s16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (s16)(s32)a.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+ (((s32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+ (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (s8)(s16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+ (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)b.word[i];
+ } else {
+ dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+ (((s32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (s16)(s32)a.word[i - 4];
+ } else {
+ dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+ (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)b.dword[i];
+ } else {
+ dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+ (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (s32)(s64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)b.qword[i];
+ } else {
+ dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+ (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (s64)(s128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.b vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.b vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] >> imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] >> imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] >> imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] >> imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
by elements in b
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
by elements in b
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
by elements in b
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in a
and b
by imm
, truncate to 8-bit and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in a
and b
by imm
, truncate to 16-bit and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in a
and b
by imm
, truncate to 32-bit and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+ : (u32)((u64)a.dword[i - 2] >> imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in a
and b
by imm
, truncate to 64-bit and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+ : (u64)((u128)a.qword[i - 1] >> imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.b vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if ((b.byte[i] & 0x7) == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+ ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((b.half[i] & 0xf) == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+ ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((b.word[i] & 0x1f) == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+ ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((b.dword[i] & 0x3f) == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+ ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.b vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (imm == 0) {
+ dst.byte[i] = a.byte[i];
+ } else {
+ dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (imm == 0) {
+ dst.half[i] = a.half[i];
+ } else {
+ dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (imm == 0) {
+ dst.word[i] = a.word[i];
+ } else {
+ dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.
for (int i = 0; i < 2; i++) {
+ if (imm == 0) {
+ dst.dword[i] = a.dword[i];
+ } else {
+ dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, truncate to 8-bit and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u8 shift = (b.half[i] & 15);
+ if (shift == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+ (((u16)a.half[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u8 shift = (b.word[i] & 31);
+ if (shift == 0) {
+ dst.half[i] = (u16)(u32)a.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+ (((u32)a.word[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u8 shift = (b.dword[i] & 63);
+ if (shift == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+ (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+ }
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)b.half[i];
+ } else {
+ dst.byte[i] =
+ (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.byte[i] = (u8)(u16)a.half[i - 8];
+ } else {
+ dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+ (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)b.word[i];
+ } else {
+ dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+ (((u32)b.word[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.half[i] = (u16)(u32)a.word[i - 4];
+ } else {
+ dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+ (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)b.dword[i];
+ } else {
+ dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+ (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.word[i] = (u32)(u64)a.dword[i - 2];
+ } else {
+ dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)b.qword[i];
+ } else {
+ dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+ (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+ }
+ } else {
+ if (imm == 0) {
+ dst.dword[i] = (u64)(u128)a.qword[i - 1];
+ } else {
+ dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+ }
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp = (s16)b.half[i] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp = (s16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp = (s32)b.word[i] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp = (s32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp = (s64)b.dword[i] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp = (s64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.du.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp = (s128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp = (s128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (s16)a.half[i];
+ } else {
+ temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+ (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (s32)a.word[i];
+ } else {
+ temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+ (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (s64)a.dword[i];
+ } else {
+ temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, -128, 127);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)b.half[i];
+ } else {
+ temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ } else {
+ s16 temp;
+ if (imm == 0) {
+ temp = (s16)a.half[i - 8];
+ } else {
+ temp =
+ ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<s16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, -32768, 32767);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)b.word[i];
+ } else {
+ temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ } else {
+ s32 temp;
+ if (imm == 0) {
+ temp = (s32)a.word[i - 4];
+ } else {
+ temp =
+ ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<s32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)b.dword[i];
+ } else {
+ temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ } else {
+ s64 temp;
+ if (imm == 0) {
+ temp = (s64)a.dword[i - 2];
+ } else {
+ temp = ((s64)a.dword[i - 2] >> imm) +
+ (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Arithmetic right shift (with rounding) the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)b.qword[i];
+ } else {
+ temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ } else {
+ s128 temp;
+ if (imm == 0) {
+ temp = (s128)a.qword[i - 1];
+ } else {
+ temp = ((s128)a.qword[i - 1] >> imm) +
+ (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp = (u16)b.half[i] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp = (u16)a.half[i - 8] >> imm;
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp = (u32)b.word[i] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp = (u32)a.word[i - 4] >> imm;
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp = (u64)b.dword[i] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp = (u64)a.dword[i - 2] >> imm;
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp = (u128)b.qword[i] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp = (u128)a.qword[i - 1] >> imm;
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if ((b.half[i] & 15) == 0) {
+ temp = (u16)a.half[i];
+ } else {
+ temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+ (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ dst.byte[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if ((b.word[i] & 31) == 0) {
+ temp = (u32)a.word[i];
+ } else {
+ temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+ (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ dst.half[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in signed 32-bit integer and store the result to the lower half of dst
. The upper half of dst is set to zero.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
by elements in b
, clamp to fit in unsigned 32-bit integer and store the result to the lower half of dst
. The upper half of dst is set to zero.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if ((b.dword[i] & 63) == 0) {
+ temp = (u64)a.dword[i];
+ } else {
+ temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+ (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in signed 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 127);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 16-bit elements in a
and b
by imm
, clamp to fit in unsigned 8-bit integer and store the result to dst
.
for (int i = 0; i < 16; i++) {
+ if (i < 8) {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)b.half[i];
+ } else {
+ temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ } else {
+ u16 temp;
+ if (imm == 0) {
+ temp = (u16)a.half[i - 8];
+ } else {
+ temp =
+ ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+ }
+ dst.byte[i] = clamp<u16>(temp, 0, 255);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in signed 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 32767);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 32-bit elements in a
and b
by imm
, clamp to fit in unsigned 16-bit integer and store the result to dst
.
for (int i = 0; i < 8; i++) {
+ if (i < 4) {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)b.word[i];
+ } else {
+ temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ } else {
+ u32 temp;
+ if (imm == 0) {
+ temp = (u32)a.word[i - 4];
+ } else {
+ temp =
+ ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+ }
+ dst.half[i] = clamp<u32>(temp, 0, 65535);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in signed 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 64-bit elements in a
and b
by imm
, clamp to fit in unsigned 32-bit integer and store the result to dst
.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)b.dword[i];
+ } else {
+ temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ } else {
+ u64 temp;
+ if (imm == 0) {
+ temp = (u64)a.dword[i - 2];
+ } else {
+ temp = ((u64)a.dword[i - 2] >> imm) +
+ (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+ }
+ dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in signed 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.du.q vr, vr, imm
+CPU Flags: LSX
+
+Logical right shift (with rounding) the unsigned 128-bit elements in a
and b
by imm
, clamp to fit in unsigned 64-bit integer and store the result to dst
.
for (int i = 0; i < 2; i++) {
+ if (i < 1) {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)b.qword[i];
+ } else {
+ temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ } else {
+ u128 temp;
+ if (imm == 0) {
+ temp = (u128)a.qword[i - 1];
+ } else {
+ temp = ((u128)a.qword[i - 1] >> imm) +
+ (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+ }
+ dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vrotr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.b vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 8-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+Tested on real machine.
+__m128i __lsx_vrotr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.h vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 16-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+ (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+Tested on real machine.
+__m128i __lsx_vrotr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.w vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 32-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+ (a.word[i] << (32 - (b.word[i] & 0x1f)));
+}
+
+Tested on real machine.
+__m128i __lsx_vrotr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.d vr, vr, vr
+CPU Flags: LSX
+
+Rotate right the unsigned 64-bit elements in a
by elements in b
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+ (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
+}
+
+Tested on real machine.
+__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.b vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 8-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.h vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 16-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.w vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 32-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.d vr, vr, imm
+CPU Flags: LSX
+
+Rotate right the unsigned 64-bit elements in a
by imm
, store the result to dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
+}
+
+Tested on real machine.
+ +__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.b vr, vr, vr, vr
+CPU Flags: LSX
+
+Shuffle bytes from a
and b
with indices from c
, save the result to dst
.
Caveat: the indices are placed in c
, while in other vshuf
intrinsics, they are placed in a
.
for (int i = 0; i < 16; i++) {
+ if (c.byte[i] >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.byte[i] = 0;
+ } else if ((c.byte[i] % 32) < 16) {
+ dst.byte[i] = b.byte[c.byte[i] % 16];
+ } else {
+ dst.byte[i] = a.byte[c.byte[i] % 16];
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.h vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 16-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.half[i] = 0;
+ } else if ((a.half[i] % 16) < 8) {
+ dst.half[i] = c.half[a.half[i] % 8];
+ } else {
+ dst.half[i] = b.half[a.half[i] % 8];
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.w vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 32-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.word[i] = 0;
+ } else if ((a.word[i] % 8) < 4) {
+ dst.word[i] = c.word[a.word[i] % 4];
+ } else {
+ dst.word[i] = b.word[a.word[i] % 4];
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.d vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 64-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.dword[i] = 0;
+ } else if ((a.dword[i] % 4) < 2) {
+ dst.dword[i] = c.dword[a.dword[i] % 2];
+ } else {
+ dst.dword[i] = b.dword[a.dword[i] % 2];
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.b vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 8-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.h vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 16-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.w vr, vr, imm
+CPU Flags: LSX
+
+Shuffle every four 32-bit elements in a
with indices packed in imm
, save the result to dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.d vr, vr, imm
+CPU Flags: LSX
+
+Select two 64-bit elements from the four 64-bit elements in a
and b
with indices packed in imm
, save the result to dst
.
dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+ (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+
+Tested on real machine.
+ +