404
+ +Page not found
+ + +diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..809adafd --- /dev/null +++ b/404.html @@ -0,0 +1,137 @@ + + +
+ + + + +Page not found
+ + +This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et, al. The documentation is arranged from the following sources:
+__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vbitsel.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise selection: for each bit position, if the bit in c
equals to one, copy the bit from b
to dst
, otherwise copy from a
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+
+ __m128i __lsx_vandi_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vandi.b vr, vr, imm
+CPU Flags: LSX
+
+Compute bitwise AND between 8-bit elements in a
and imm
.
for (int i = 0;i < 16;i++) {
+ dst.byte[i] = a.byte[i] & imm;
+}
+
+__m128i __lsx_vandn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vandn.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise ANDN between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+__m128i __lsx_vand_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vand.v vr, vr, vr
+CPU Flags: LSX
+
+Compute bitwise AND between elements in a
and b
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+
+ __m128i __lsx_vfcmp_cond_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cond.s vr, vr
+CPU Flags: LSX
+
+cond
can be one of:
caf
: Quiet Always Falseceq
: Quiet Equalcle
: Quiet Less than or Equalclt
: Quiet Less thancne
: Quiet Not Equalcor
: Quiet Orderedcueq
: Quiet Unordered Equalcule
: Quiet Unordered Less than or Equalcult
: Quiet Unordered Less thancun
: Quiet Unorderedcune
: Quiet Unordered Not Equalsaf
: Signaling Always Falseseq
: Signaling Equalsle
: Signaling Less than or Equalslt
: Signaling Less thansne
: Signaling Not Equalsor
: Signaling Orderedsueq
: Signaling Unordered Equalsule
: Signaling Unordered Less than or Equalsult
: Signaling Unordered Less thansun
: Signaling Unorderedsune
: Signaling Unordered Not EqualCompare single precision floating point elements from a
and b
, save the comparison result into dst
.
for (int i = 0;i < 4;i++) {
+ if (fp_compare(cond, a.fp32[i], b.fp32[i])) {
+ dst.word[i] = 0xFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+__m128i __lsx_vfcmp_cond_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cond.d vr, vr
+CPU Flags: LSX
+
+cond
can be one of:
caf
: Quiet Always Falseceq
: Quiet Equalcle
: Quiet Less than or Equalclt
: Quiet Less thancne
: Quiet Not Equalcor
: Quiet Orderedcueq
: Quiet Unordered Equalcule
: Quiet Unordered Less than or Equalcult
: Quiet Unordered Less thancun
: Quiet Unorderedcune
: Quiet Unordered Not Equalsaf
: Signaling Always Falseseq
: Signaling Equalsle
: Signaling Less than or Equalslt
: Signaling Less thansne
: Signaling Not Equalsor
: Signaling Orderedsueq
: Signaling Unordered Equalsule
: Signaling Unordered Less than or Equalsult
: Signaling Unordered Less thansun
: Signaling Unorderedsune
: Signaling Unordered Not EqualCompare double precision floating point elements from a
and b
, save the comparison result into dst
.
for (int i = 0;i < 2;i++) {
+ if (fp_compare(cond, a.fp64[i], b.fp64[i])) {
+ dst.word[i] = 0xFFFFFFFFFFFFFFFF;
+ } else {
+ dst.word[i] = 0;
+ }
+}
+
+
+ __m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmadd.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmsub.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, accumulate to elements in c
and store the negated result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = (a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.s vr, vr, vr
+CPU Flags: LSX
+
+Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.d vr, vr, vr
+CPU Flags: LSX
+
+Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a
and b
, subtract elements in c
and store the negated result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+
+ __m128i __lsx_vadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.b vr, vr, vr
+CPU Flags: LSX
+
+Add 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.h vr, vr, vr
+CPU Flags: LSX
+
+Add 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + b.half[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.w vr, vr, vr
+CPU Flags: LSX
+
+Add 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + b.word[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.d vr, vr, vr
+CPU Flags: LSX
+
+Add 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+Tested on real machine.
+__m128i __lsx_vadd_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.q vr, vr, vr
+CPU Flags: LSX
+
+Add 128-bit elements in a
and b
, save the result in dst
.
dst.qword[0] = a.qword[0] + b.qword[0];
+
+Tested on real machine.
+__m128i __lsx_vabsd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.b vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i]) : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i]) : (b.byte[i] - a.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.h vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i]) : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i]) : (b.half[i] - a.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.w vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i]) : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i]) : (b.word[i] - a.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.d vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i]) ? (a.dword[i] - b.dword[i]) : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vabsd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.du vr, vr, vr
+CPU Flags: LSX
+
+Compute absolute difference of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i]) ? (a.dword[i] - b.dword[i]) : (b.dword[i] - a.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.b vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.h vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.w vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vadda_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.d vr, vr, vr
+CPU Flags: LSX
+
+Add absolute of 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.bu vr, vr, imm
+CPU Flags: LSX
+
+Add 8-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = a.byte[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.hu vr, vr, imm
+CPU Flags: LSX
+
+Add 16-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = a.half[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.wu vr, vr, imm
+CPU Flags: LSX
+
+Add 32-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = a.word[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.du vr, vr, imm
+CPU Flags: LSX
+
+Add 64-bit elements in a
and imm
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = a.dword[i] + imm;
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0;i < 8;i++) {
+ dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and unsigned elements in b
, save the 16-bit result in dst
.
for (int i = 0;i < 8;i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 8-bit elements in a
and signed elements in b
, save the 16-bit result in dst
.
for (int i = 0;i < 8;i++) {
+ dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and unsigned elements in b
, save the 32-bit result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 16-bit elements in a
and signed elements in b
, save the 32-bit result in dst
.
for (int i = 0;i < 4;i++) {
+ dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and unsigned elements in b
, save the 64-bit result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 32-bit elements in a
and signed elements in b
, save the 64-bit result in dst
.
for (int i = 0;i < 2;i++) {
+ dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned signed 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
dst.qword[0] = (s128)(s64)a.dword[0] + (s128)(s64)b.dword[0];
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and unsigned elements in b
, save the 128-bit result in dst
.
dst.qword[0] = (u128)(u64)a.dword[0] + (u128)(u64)b.dword[0];
+
+Tested on real machine.
+__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+Add even-positioned unsigned 64-bit elements in a
and signed elements in b
, save the 128-bit result in dst
.
dst.qword[0] = (u128)(u64)a.dword[0] + (s128)(s64)b.dword[0];
+
+Tested on real machine.
+__m128i __lsx_vavg_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.b vr, vr, vr
+CPU Flags: LSX
+
+Compute the average of signed 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) + (a.byte[i] & b.byte[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.bu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average of unsigned 8-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 16; i++) {
+ dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) + (a.byte[i] & b.byte[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.h vr, vr, vr
+CPU Flags: LSX
+
+Compute the average of signed 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) + (a.half[i] & b.half[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.hu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average of unsigned 16-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 8; i++) {
+ dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) + (a.half[i] & b.half[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.w vr, vr, vr
+CPU Flags: LSX
+
+Compute the average of signed 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) + (a.word[i] & b.word[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.wu vr, vr, vr
+CPU Flags: LSX
+
+Compute the average of unsigned 32-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 4; i++) {
+ dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) + (a.word[i] & b.word[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.d vr, vr, vr
+CPU Flags: LSX
+
+Compute the average of signed 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) + (a.dword[i] & b.dword[i] & 1);
+}
+
+Tested on real machine.
+__m128i __lsx_vavg_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.du vr, vr, vr
+CPU Flags: LSX
+
+Compute the average of unsigned 64-bit elements in a
and b
, save the result in dst
.
for (int i = 0; i < 2; i++) {
+ dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) + (a.dword[i] & b.dword[i] & 1);
+}
+
+Tested on real machine.
+ +__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vld vr, r, imm
+CPU Flags: LSX
+
+Read 128-bit data from memory address addr + offset
, save the data into dst
.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldx (void * addr, long int offset);
+#include <lsxintrin.h>
+Instruction: vldx vr, r, r
+CPU Flags: LSX
+
+Read 128-bit data from memory address addr + offset
, save the data into dst
.
dst = memory_load(128, addr + offset);
+
+__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.b vr, r, imm
+CPU Flags: LSX
+
+Read 8-bit data from memory address addr + (offset << 0)
, replicate the data to all vector lanes and save into dst
.
u8 data = memory_load(8, addr + offset);
+for (int i = 0;i < 16;i++) {
+ dst.byte[i] = data;
+}
+
+__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.h vr, r, imm
+CPU Flags: LSX
+
+Read 16-bit data from memory address addr + (offset << 1)
, replicate the data to all vector lanes and save into dst
.
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0;i < 8;i++) {
+ dst.half[i] = data;
+}
+
+__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.w vr, r, imm
+CPU Flags: LSX
+
+Read 32-bit data from memory address addr + (offset << 2)
, replicate the data to all vector lanes and save into dst
.
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0;i < 4;i++) {
+ dst.word[i] = data;
+}
+
+__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.d vr, r, imm
+CPU Flags: LSX
+
+Read 64-bit data from memory address addr + (offset << 3)
, replicate the data to all vector lanes and save into dst
.
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0;i < 2;i++) {
+ dst.dword[i] = data;
+}
+
+
+ __m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vpermi.w vr, vr, imm
+CPU Flags: LSX
+
+Permute words from a
and b
with indices recorded in imm
and store into dst
.
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+
+
+ __m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.b vr, vr, vr, vr
+CPU Flags: LSX
+
+Shuffle bytes from a
and b
with indices from c
.
Caveat: the indices are placed in c
, while in other vshuf
intrinsics they are placed in a
.
for (int i = 0; i < 16; i++) {
+ if (c.byte[i] >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.byte[i] = 0;
+ } else if ((c.byte[i] % 32) < 16) {
+ dst.byte[i] = b.byte[c.byte[i] % 16];
+ } else {
+ dst.byte[i] = a.byte[c.byte[i] % 16];
+ }
+}
+
+__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.h vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 16-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 8; i++) {
+ if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.half[i] = 0;
+ } else if ((a.half[i] % 16) < 8) {
+ dst.half[i] = c.half[a.half[i] % 8];
+ } else {
+ dst.half[i] = b.half[a.half[i] % 8];
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.w vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 32-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 4; i++) {
+ if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.word[i] = 0;
+ } else if ((a.word[i] % 8) < 4) {
+ dst.word[i] = c.word[a.word[i] % 4];
+ } else {
+ dst.word[i] = b.word[a.word[i] % 4];
+ }
+}
+
+Tested on real machine.
+__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.d vr, vr, vr
+CPU Flags: LSX
+
+Shuffle 64-bit elements in b
and c
with indices from a
, save the result to dst
.
for (int i = 0; i < 2; i++) {
+ if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+ // Caveat: observed in 3C5000
+ dst.dword[i] = 0;
+ } else if ((a.dword[i] % 4) < 2) {
+ dst.dword[i] = c.dword[a.dword[i] % 2];
+ } else {
+ dst.dword[i] = b.dword[a.dword[i] % 2];
+ }
+}
+
+Tested on real machine.
+ +' + escapeHtml(summary) +'
' + noResultsText + '
'); + } +} + +function doSearch () { + var query = document.getElementById('mkdocs-search-query').value; + if (query.length > min_search_length) { + if (!window.Worker) { + displayResults(search(query)); + } else { + searchWorker.postMessage({query: query}); + } + } else { + // Clear results for short queries + displayResults([]); + } +} + +function initSearch () { + var search_input = document.getElementById('mkdocs-search-query'); + if (search_input) { + search_input.addEventListener("keyup", doSearch); + } + var term = getSearchTermFromLocation(); + if (term) { + search_input.value = term; + doSearch(); + } +} + +function onWorkerMessage (e) { + if (e.data.allowSearch) { + initSearch(); + } else if (e.data.results) { + var results = e.data.results; + displayResults(results); + } else if (e.data.config) { + min_search_length = e.data.config.min_search_length-1; + } +} + +if (!window.Worker) { + console.log('Web Worker API not supported'); + // load index in main thread + $.getScript(joinUrl(base_url, "search/worker.js")).done(function () { + console.log('Loaded worker'); + init(); + window.postMessage = function (msg) { + onWorkerMessage({data: msg}); + }; + }).fail(function (jqxhr, settings, exception) { + console.error('Could not load worker.js'); + }); +} else { + // Wrap search in a web worker + var searchWorker = new Worker(joinUrl(base_url, "search/worker.js")); + searchWorker.postMessage({init: true}); + searchWorker.onmessage = onWorkerMessage; +} diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 00000000..bb07fc3f --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Unofficial LoongArch Intrinsics Guide This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et, al. The documentation is arranged from the following sources: QEMU GCC Observations from real hardware incl. 3C5000 and 3A6000","title":"Unofficial LoongArch Intrinsics Guide"},{"location":"#unofficial-loongarch-intrinsics-guide","text":"This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et, al. The documentation is arranged from the following sources: QEMU GCC Observations from real hardware incl. 3C5000 and 3A6000","title":"Unofficial LoongArch Intrinsics Guide"},{"location":"lsx_bitops/vbitsel/","text":"Bitwise Selection __m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c) Synopsis __m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c) #include