404
+ +Page not found
+ + +diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..8df1f10a --- /dev/null +++ b/404.html @@ -0,0 +1,137 @@ + + +
+ + + + +Page not found
+ + +This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et, al. The documentation is arranged from the following sources:
+__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vbitsel.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise selection: for each bit position, if the bit in c
equals to one, copy the bit from b
to dst
, otherwise copy from a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vbitseli.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise selection: for each bit position, if the bit in a
equals to one, copy the bit from imm
to dst
, otherwise copy from b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+__m128i __lsx_vbitclr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.b vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.h vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.w vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.d vr, vr, vr
+CPU Flags: LSX
+
+Clear the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.b vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.h vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.w vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.d vr, vr, imm
+CPU Flags: LSX
+
+Clear the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitset_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.b vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitset_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.h vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitset_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.w vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitset_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.d vr, vr, vr
+CPU Flags: LSX
+
+Set the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.b vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.h vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.w vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.d vr, vr, imm
+CPU Flags: LSX
+
+Set the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.b vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.h vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.w vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.d vr, vr, vr
+CPU Flags: LSX
+
+Toggle the bit specified by elements in b
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.b vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 8-bit elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.h vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 16-bit elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.w vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 32-bit elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.d vr, vr, imm
+CPU Flags: LSX
+
+Toggle the bit specified by imm
from 64-bit elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+Tested on real machine.
+__m128i __lsx_clo_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.b vr, vr
+CPU Flags: LSX
+
+Count leading ones of 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clo(a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_clo_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.h vr, vr
+CPU Flags: LSX
+
+Count leading ones of 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clo(a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_clo_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.w vr, vr
+CPU Flags: LSX
+
+Count leading ones of 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clo(a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_clo_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.d vr, vr
+CPU Flags: LSX
+
+Count leading ones of 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clo(a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_clz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.b vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = clz(a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_clz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.h vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = clz(a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_clz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.w vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = clz(a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_clz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.d vr, vr
+CPU Flags: LSX
+
+Count leading zeros of 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = clz(a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_h_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.h.b vr, vr
+CPU Flags: LSX
+
+Extend signed 8-bit elements in the higher half of a
to 16-bit.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_hu_bu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.hu.bu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 8-bit elements in the higher half of a
to 16-bit.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_w_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.w.h vr, vr
+CPU Flags: LSX
+
+Extend signed 16-bit elements in the higher half of a
to 32-bit.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_wu_hu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.wu.hu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 16-bit elements in the higher half of a
to 32-bit.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.d.w vr, vr
+CPU Flags: LSX
+
+Extend signed 32-bit elements in the higher half of a
to 64-bit.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_du_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.du.wu vr, vr
+CPU Flags: LSX
+
+Extend unsigned 32-bit elements in the higher half of a
to 64-bit.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.q.d vr, vr
+CPU Flags: LSX
+
+Extend signed 64-bit elements in the higher half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vexth_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.qu.du vr, vr
+CPU Flags: LSX
+
+Extend unsigned 64-bit elements in the higher half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextl_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.q.d vr, vr
+CPU Flags: LSX
+
+Extend signed 64-bit elements in the lower half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextl_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.qu.du vr, vr
+CPU Flags: LSX
+
+Extend unsigned 64-bit elements in the lower half of a
to 128-bit.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.b vr, vr, imm
+CPU Flags: LSX
+
+Extract one 8-bit element in b
and insert it to a
according to imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.h vr, vr, imm
+CPU Flags: LSX
+
+Extract one 16-bit element in b
and insert it to a
according to imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.w vr, vr, imm
+CPU Flags: LSX
+
+Extract one 32-bit element in b
and insert it to a
according to imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.d vr, vr, imm
+CPU Flags: LSX
+
+Extract one 64-bit element in b
and insert it to a
according to imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_pcnt_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.b vr, vr
+CPU Flags: LSX
+
+Count the number of ones in 8-bit elements in a
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = popcount(a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_pcnt_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.h vr, vr
+CPU Flags: LSX
+
+Count the number of ones in 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = popcount(a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_pcnt_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.w vr, vr
+CPU Flags: LSX
+
+Count the number of ones in 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = popcount(a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_pcnt_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.d vr, vr
+CPU Flags: LSX
+
+Count the number of ones in 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = popcount(a.dword[i]);
+}
+
+Tested on real machine.
+ +__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Do not trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.s vr, vr, vr
+CPU Flags: LSX
+
+Compare single precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 4;i++) {
+ if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.d vr, vr, vr
+CPU Flags: LSX
+
+Compare double precision elements in a
and b
, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst
. Trap for QNaN.
for (int i = 0;i < 2;i++) {
+ if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+
+ __m128d __lsx_vfadd_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add double precision floating point elements in a
to b
and store the result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+__m128d __lsx_vfadd_s (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfadd.s vr, vr, vr
+CPU Flags: LSX
+
+Add single precision floating point elements in a
to b
and store the result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfdiv.s vr, vr, vr
+CPU Flags: LSX
+
+Divide __m128 precision floating point elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfdiv.d vr, vr, vr
+CPU Flags: LSX
+
+Divide __m128d precision floating point elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+
+ __m128d __lsx_vfcvth_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvth.d.s vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in higher half of a
to double precision.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp32[2 + i];
+}
+
+__m128d __lsx_vfcvtl_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.d.s vr, vr
+CPU Flags: LSX
+
+Convert single precision floating point elements in lower half of a
to double precision.
for (int i = 0; i < 2; i++) {
+ dst.fp64[i] = a.fp32[i];
+}
+
+__m128 __lsx_vfcvt_s_d (__m128a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcvt.s.d vr, vr, vr
+CPU Flags: LSX
+
+Convert double precision floating point elements in a
and b
to double precision.
for (int i = 0; i < 4; i++) {
+ if (i < 2) {
+ dst.fp32[i] = b.fp64[i];
+ } else {
+ dst.fp32[i] = a.fp64[i - 2];
+ }
+}
+
+
+ __m128i __lsx_vfclass_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfclass.d vr, vr
+CPU Flags: LSX
+
+Classifiy each double precision floating point elements in a
.
for (int i = 0;i < 2;i++) {
+ dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+__m128i __lsx_vfclass_s (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfclass.s vr, vr
+CPU Flags: LSX
+
+Classifiy each single precision floating point elements in a
.
for (int i = 0;i < 4;i++) {
+ dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+
+ __m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmadd.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmsub.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = (a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+
+ __m128i __lsx_vadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.b vr, vr, vr
+CPU Flags: LSX
+
+Add 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.h vr, vr, vr
+CPU Flags: LSX
+
+Add 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + b.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.w vr, vr, vr
+CPU Flags: LSX
+
+Add 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + b.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.q vr, vr, vr
+CPU Flags: LSX
+
+Add 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] + b.qword[0];
+
+Tested on real machine.
+__m128i __lsx_vabsd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.b vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+ : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.h vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+ : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.w vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+ : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.du vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+ ? (a.dword[i] - b.dword[i])
+ : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.b vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.h vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.w vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.d vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.bu vr, vr, imm
+CPU Flags: LSX
+
+Add 8-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.hu vr, vr, imm
+CPU Flags: LSX
+
+Add 16-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.wu vr, vr, imm
+CPU Flags: LSX
+
+Add 32-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.du vr, vr, imm
+CPU Flags: LSX
+
+Add 64-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) + (a.byte[i] & b.byte[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] =
+ ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) + (a.byte[i] & b.byte[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ (a.half[i] & b.half[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ (a.half[i] & b.half[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ (a.word[i] & b.word[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ (a.word[i] & b.word[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ (a.dword[i] & b.dword[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ (a.dword[i] & b.dword[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+ ((a.byte[i] | b.byte[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+ ((a.half[i] | b.half[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+ ((a.word[i] | b.word[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavgr_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+ ((a.dword[i] | b.dword[i]) & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_div_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.b vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : (s8)a.byte[i] / ((s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_div_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.bu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 8-bit elements in a
by elements in b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (b.byte[i] == 0) ? 0 : (u8)a.byte[i] / ((u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_div_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.h vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : (s16)a.half[i] / ((s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_div_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.hu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 16-bit elements in a
by elements in b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (b.half[i] == 0) ? 0 : (u16)a.half[i] / ((u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_div_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.w vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : (s32)a.word[i] / ((s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_div_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.wu vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 32-bit elements in a
by elements in b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (b.word[i] == 0) ? 0 : (u32)a.word[i] / ((u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_div_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.d vr, vr, vr
+CPU Flags: LSX
+
+Divide signed 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : (s64)a.dword[i] / ((s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_div_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.du vr, vr, vr
+CPU Flags: LSX
+
+Divide unsigned 64-bit elements in a
by elements in b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (b.dword[i] == 0) ? 0 : (u64)a.dword[i] / ((u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 8-bit elements in a
to even-positioned signed 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 8-bit elements in a
to even-positioned unsigned 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 16-bit elements in a
to even-positioned signed 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 16-bit elements in a
to even-positioned unsigned 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 32-bit elements in a
to even-positioned signed 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 32-bit elements in a
to even-positioned unsigned 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned signed 64-bit elements in a
to even-positioned signed 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+Add odd-positioned unsigned 64-bit elements in a
to even-positioned unsigned 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 8-bit elements in a
by even-positioned signed 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 8-bit elements in a
by even-positioned unsigned 8-bit elements in 'b' to get 16-bit result.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 16-bit elements in a
by even-positioned signed 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 16-bit elements in a
by even-positioned unsigned 16-bit elements in 'b' to get 32-bit result.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 32-bit elements in a
by even-positioned signed 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 32-bit elements in a
by even-positioned unsigned 32-bit elements in 'b' to get 64-bit result.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned signed 64-bit elements in a
by even-positioned signed 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract odd-positioned unsigned 64-bit elements in a
by even-positioned unsigned 64-bit elements in 'b' to get 128-bit result.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in b
and c
, add to elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] =
+ (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] =
+ (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] =
+ (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and unsigned elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 8-bit elements in b
and signed elements in c
, add to 16-bit elements in a
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] =
+ (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and unsigned elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+ (u32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 16-bit elements in b
and signed elements in c
, add to 32-bit elements in a
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+ (s32)a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and unsigned elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+ (u64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 32-bit elements in b
and signed elements in c
, add to 64-bit elements in a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+ (s64)a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned signed 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and unsigned elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+ (u128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply odd-positioned unsigned 64-bit elements in b
and signed elements in c
, add to 128-bit elements in a
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+ (s128)a.qword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.b vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.h vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.w vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.d vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmax_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.du vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_bu (__m128i a, imm_0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.bu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.h vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_hu (__m128i a, imm_0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.hu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.w vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_wu (__m128i a, imm_0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.wu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.d vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmaxi_du (__m128i a, imm_0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.du vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise maximum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.b vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.h vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.w vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.d vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmin_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.du vr, vr, vr
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.b vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_bu (__m128i a, imm_0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.bu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 8-bit elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.h vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_hu (__m128i a, imm_0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.hu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 16-bit elements in a
and imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.w vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_wu (__m128i a, imm_0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.wu vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 32-bit elements in a
and imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.d vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for signed 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmini_du (__m128i a, imm_0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.du vr, vr, imm
+CPU Flags: LSX
+
+Compute elementwise minimum for unsigned 64-bit elements in a
and imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in b
and c
, negate and add elements in a
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmul_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmul_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] * b.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmul_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] * b.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmul_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Multiply even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] - b.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] - b.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsub_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.q vr, vr, vr
+CPU Flags: LSX
+
+Subtract 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] - b.qword[0];
+
+Tested on real machine.
+__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Subtract even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
for (int i = 0; i < 1; i++) {
+ dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+Tested on real machine.
+ +__m128i __lsx_vilvh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.b vr, vr, vr
+CPU Flags: LSX
+
+Interleave 8-bit elements in higher half of a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.h vr, vr, vr
+CPU Flags: LSX
+
+Interleave 16-bit elements in higher half of a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.w vr, vr, vr
+CPU Flags: LSX
+
+Interleave 32-bit elements in higher half of a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.d vr, vr, vr
+CPU Flags: LSX
+
+Interleave 64-bit elements in higher half of a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.b vr, vr, vr
+CPU Flags: LSX
+
+Interleave 8-bit elements in lower half of a
and b
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.h vr, vr, vr
+CPU Flags: LSX
+
+Interleave 16-bit elements in lower half of a
and b
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.w vr, vr, vr
+CPU Flags: LSX
+
+Interleave 32-bit elements in lower half of a
and b
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+
+Tested on real machine.
+__m128i __lsx_vilvl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.d vr, vr, vr
+CPU Flags: LSX
+
+Interleave 64-bit elements in lower half of a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+
+Tested on real machine.
+ +__m128i __lsx_vand_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vand.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vandi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] & imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vandn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vandn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise ANDN between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vnor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vnor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise NOR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vnori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise NOR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+Tested on real machine.
+__m128i __lsx_vor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise OR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise OR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] | imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vorn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vorn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise ORN between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vxor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vxor.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise XOR between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vxori.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise XOR between elements in a
and imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] ^ imm;
+}
+
+Tested on real machine.
+ +__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vld vr, r, imm
+CPU Flags: LSX
+
+Read 128-bit data from memory address addr + offset
, save the data into dst
.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldx (void * addr, long int offset);
+#include <lsxintrin.h>
+Instruction: vldx vr, r, r
+CPU Flags: LSX
+
+Read 128-bit data from memory address addr + offset
, save the data into dst
.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.b vr, r, imm
+CPU Flags: LSX
+
+Read 8-bit data from memory address addr + (offset << 0)
, replicate the data to all vector lanes and save into dst
.
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 16; i++) {
+ dst.byte[i] = data;
+}
+
+__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.h vr, r, imm
+CPU Flags: LSX
+
+Read 16-bit data from memory address addr + (offset << 1)
, replicate the data to all vector lanes and save into dst
.
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 8; i++) {
+ dst.half[i] = data;
+}
+
+__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.w vr, r, imm
+CPU Flags: LSX
+
+Read 32-bit data from memory address addr + (offset << 2)
, replicate the data to all vector lanes and save into dst
.
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 4; i++) {
+ dst.word[i] = data;
+}
+
+__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.d vr, r, imm
+CPU Flags: LSX
+
+Read 64-bit data from memory address addr + (offset << 3)
, replicate the data to all vector lanes and save into dst
.
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 2; i++) {
+ dst.dword[i] = data;
+}
+
+void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vst vr, r, imm
+CPU Flags: LSX
+
+Write 128-bit data in data
to memory address addr + offset
.
memory_store(128, data, addr + offset);
+
+void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.b vr, r, imm, imm
+CPU Flags: LSX
+
+Store the element in data
specified by lane
to memory address addr + offset
.
memory_store(8, data.byte[lane], addr + offset);
+
+void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.h vr, r, imm, imm
+CPU Flags: LSX
+
+Store the element in data
specified by lane
to memory address addr + offset
.
memory_store(16, data.half[lane], addr + offset);
+
+void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.w vr, r, imm, imm
+CPU Flags: LSX
+
+Store the element in data
specified by lane
to memory address addr + offset
.
memory_store(32, data.word[lane], addr + offset);
+
+void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.d vr, r, imm, imm
+CPU Flags: LSX
+
+Store the element in data
specified by lane
to memory address addr + offset
.
memory_store(64, data.dword[lane], addr + offset);
+
+
+ __m128i __lsx_insgr2vr_b (__m128i a, int b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.b vr, r, imm
+CPU Flags: LSX
+
+Insert 8-bit element into lane indexed imm
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = (i == imm) ? b : a.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_insgr2vr_h (__m128i a, int b, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.h vr, r, imm
+CPU Flags: LSX
+
+Insert 16-bit element into lane indexed imm
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = (i == imm) ? b : a.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_insgr2vr_w (__m128i a, int b, imm0_3 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.w vr, r, imm
+CPU Flags: LSX
+
+Insert 32-bit element into lane indexed imm
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_insgr2vr_d (__m128i a, long int b, imm0_1 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.d vr, r, imm
+CPU Flags: LSX
+
+Insert 64-bit element into lane indexed imm
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+Tested on real machine.
+ +__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vpermi.w vr, vr, imm
+CPU Flags: LSX
+
+Permute words from a
and b
with indices recorded in imm
and store into dst
.
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+
+
+ __m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsll.v vr, vr, imm
+CPU Flags: LSX
+
+Compute 128-bit a
shifted left by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+
+__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsrl.v vr, vr, imm
+CPU Flags: LSX
+
+Compute 128-bit a
shifted right by imm * 8
bits.
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+
+
+ __m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.b vr, vr, vr, vr
+CPU Flags: LSX
+
+Shuffle bytes from a
and b
with indices from c
.
Caveat: the indices are placed in c
, while in other vshuf
intrinsics they are placed in a
.
for (int i = 0; i < 16; i++) {
+ if (c.byte[i] >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.byte[i] = 0;
+ } else if ((c.byte[i] % 32) < 16) {
+ dst.byte[i] = b.byte[c.byte[i] % 16];
+ } else {
+ dst.byte[i] = a.byte[c.byte[i] % 16];
+ }
+}
+
+__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.h vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 16-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.half[i] = 0;
+ } else if ((a.half[i] % 16) < 8) {
+ dst.half[i] = c.half[a.half[i] % 8];
+ } else {
+ dst.half[i] = b.half[a.half[i] % 8];
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.w vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 32-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.word[i] = 0;
+ } else if ((a.word[i] % 8) < 4) {
+ dst.word[i] = c.word[a.word[i] % 4];
+ } else {
+ dst.word[i] = b.word[a.word[i] % 4];
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.d vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 64-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.dword[i] = 0;
+ } else if ((a.dword[i] % 4) < 2) {
+ dst.dword[i] = c.dword[a.dword[i] % 2];
+ } else {
+ dst.dword[i] = b.dword[a.dword[i] % 2];
+ }
+}
+
+Tested on real machine.
+ +' + escapeHtml(summary) +'
' + noResultsText + '
'); + } +} + +function doSearch () { + var query = document.getElementById('mkdocs-search-query').value; + if (query.length > min_search_length) { + if (!window.Worker) { + displayResults(search(query)); + } else { + searchWorker.postMessage({query: query}); + } + } else { + // Clear results for short queries + displayResults([]); + } +} + +function initSearch () { + var search_input = document.getElementById('mkdocs-search-query'); + if (search_input) { + search_input.addEventListener("keyup", doSearch); + } + var term = getSearchTermFromLocation(); + if (term) { + search_input.value = term; + doSearch(); + } +} + +function onWorkerMessage (e) { + if (e.data.allowSearch) { + initSearch(); + } else if (e.data.results) { + var results = e.data.results; + displayResults(results); + } else if (e.data.config) { + min_search_length = e.data.config.min_search_length-1; + } +} + +if (!window.Worker) { + console.log('Web Worker API not supported'); + // load index in main thread + $.getScript(joinUrl(base_url, "search/worker.js")).done(function () { + console.log('Loaded worker'); + init(); + window.postMessage = function (msg) { + onWorkerMessage({data: msg}); + }; + }).fail(function (jqxhr, settings, exception) { + console.error('Could not load worker.js'); + }); +} else { + // Wrap search in a web worker + var searchWorker = new Worker(joinUrl(base_url, "search/worker.js")); + searchWorker.postMessage({init: true}); + searchWorker.onmessage = onWorkerMessage; +} diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 00000000..76494ef7 --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Unofficial LoongArch Intrinsics Guide This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et, al. The documentation is arranged from the following sources: QEMU GCC Observations from real hardware incl. 3C5000 and 3A6000","title":"Unofficial LoongArch Intrinsics Guide"},{"location":"#unofficial-loongarch-intrinsics-guide","text":"This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et, al. The documentation is arranged from the following sources: QEMU GCC Observations from real hardware incl. 3C5000 and 3A6000","title":"Unofficial LoongArch Intrinsics Guide"},{"location":"lsx/bitwise_operations/","text":"Bitwise Operations __m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c) Synopsis __m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c) #include