From 527619871e144c4af17edf678705905efe2a7cc2 Mon Sep 17 00:00:00 2001 From: dpd Date: Tue, 5 Aug 2025 10:55:34 +0800 Subject: [PATCH] Adding explicit type casting for vector optimization in LoongArch architecture --- bin/activate-emsdk | 2 +- bin/fetch-gn | 2 +- src/core/SkBlurEngine.cpp | 54 +++++------ src/opts/SkRasterPipeline_opts.h | 148 +++++++++++++++---------------- 4 files changed, 104 insertions(+), 102 deletions(-) diff --git a/bin/activate-emsdk b/bin/activate-emsdk index b3c529f800cb..062657d4f0b9 100755 --- a/bin/activate-emsdk +++ b/bin/activate-emsdk @@ -18,7 +18,7 @@ EMSDK_PATH = os.path.join(EMSDK_ROOT, 'emsdk.py') EMSDK_VERSION = '4.0.7' def main(): - if sysconfig.get_platform() in ['linux-aarch64', 'linux-arm64']: + if sysconfig.get_platform() in ['linux-aarch64', 'linux-arm64', 'linux-loongarch64']: # This platform cannot install emsdk at the provided version. See # https://github.com/emscripten-core/emsdk/blob/main/emscripten-releases-tags.json#L5 # for the latest version diff --git a/bin/fetch-gn b/bin/fetch-gn index ebdc31c4496a..d348fde0db54 100755 --- a/bin/fetch-gn +++ b/bin/fetch-gn @@ -20,7 +20,7 @@ os.chdir(os.path.join(os.path.dirname(__file__), os.pardir)) gnzip = os.path.join(tempfile.mkdtemp(), 'gn.zip') with open(gnzip, 'wb') as f: OS = {'darwin': 'mac', 'linux': 'linux', 'linux2': 'linux', 'win32': 'windows'}[sys.platform] - cpu = {'aarch64': 'arm64', 'amd64': 'amd64', 'arm64': 'arm64', 'x86_64': 'amd64'}[platform.machine().lower()] + cpu = {'aarch64': 'arm64', 'amd64': 'amd64', 'arm64': 'arm64', 'x86_64': 'amd64', 'loongarch64': 'loongarch64'}[platform.machine().lower()] rev = 'b2afae122eeb6ce09c52d63f67dc53fc517dbdc8' url = 'https://chrome-infra-packages.appspot.com/dl/gn/gn/{}-{}/+/git_revision:{}'.format( diff --git a/src/core/SkBlurEngine.cpp b/src/core/SkBlurEngine.cpp index 989da9594822..58947af4b82a 100644 --- a/src/core/SkBlurEngine.cpp +++ b/src/core/SkBlurEngine.cpp @@ -556,62 +556,64 @@ class ThreeBoxApproxPass final : public Pass { skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor; skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor; skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor; - v4u32 sum0 = __lsx_vld(fSum0, 0); // same as skvx::Vec<4, uint32_t>::Load(fSum0); - v4u32 sum1 = __lsx_vld(fSum1, 0); - v4u32 sum2 = __lsx_vld(fSum2, 0); + v4u32 sum0 = (v4u32)__lsx_vld(fSum0, 0); // same as skvx::Vec<4, uint32_t>::Load(fSum0); + v4u32 sum1 = (v4u32)__lsx_vld(fSum1, 0); + v4u32 sum2 = (v4u32)__lsx_vld(fSum2, 0); auto processValue = [&](v4u32& vLeadingEdge){ sum0 += vLeadingEdge; sum1 += sum0; sum2 += sum1; - v4u32 divisorFactor = __lsx_vreplgr2vr_w(fDivider.divisorFactor()); - v4u32 blurred = __lsx_vmuh_w(divisorFactor, sum2); + v4u32 divisorFactor = (v4u32)__lsx_vreplgr2vr_w(fDivider.divisorFactor()); + v4u32 blurred = (v4u32)__lsx_vmuh_w((__m128i)divisorFactor, (__m128i)sum2); - v4u32 buffer2Value = __lsx_vld(buffer2Cursor, 0); //Not fBuffer0Cursor, out of bounds. + v4u32 buffer2Value = (v4u32)__lsx_vld(buffer2Cursor, 0); //Not fBuffer0Cursor, out of bounds. sum2 -= buffer2Value; - __lsx_vst(sum1, (void *)buffer2Cursor, 0); + __lsx_vst((__m128i)sum1, (void *)buffer2Cursor, 0); buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? 
buffer2Cursor + 1 : fBuffer2; - v4u32 buffer1Value = __lsx_vld(buffer1Cursor, 0); + v4u32 buffer1Value = (v4u32)__lsx_vld(buffer1Cursor, 0); sum1 -= buffer1Value; - __lsx_vst(sum0, (void *)buffer1Cursor, 0); + __lsx_vst((__m128i)sum0, (void *)buffer1Cursor, 0); buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1; - v4u32 buffer0Value = __lsx_vld(buffer0Cursor, 0); + v4u32 buffer0Value = (v4u32)__lsx_vld(buffer0Cursor, 0); sum0 -= buffer0Value; - __lsx_vst(vLeadingEdge, (void *)buffer0Cursor, 0); + __lsx_vst((__m128i)vLeadingEdge, (void *)buffer0Cursor, 0); buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0; v16u8 shuf = {0x0,0x4,0x8,0xc,0x0}; - v16u8 ret = __lsx_vshuf_b(blurred, blurred, shuf); + v16u8 ret = (v16u8)__lsx_vshuf_b((__m128i)blurred, (__m128i)blurred, (__m128i)shuf); return ret; }; - v4u32 zero = __lsx_vldi(0x0); + v4u32 zero = (v4u32)__lsx_vldi(0x0); if (!src32 && !dst32) { while (n --> 0) { (void)processValue(zero); } } else if (src32 && !dst32) { while (n --> 0) { - v4u32 edge = __lsx_vinsgr2vr_w(zero, *src32, 0); - edge = __lsx_vilvl_b(zero, edge); - edge = __lsx_vilvl_h(zero, edge); + v4u32 edge = (v4u32)__lsx_vinsgr2vr_w((__m128i)zero, *src32, 0); + edge = (v4u32)__lsx_vilvl_b((__m128i)zero, (__m128i)edge); + edge = (v4u32)__lsx_vilvl_h((__m128i)zero, (__m128i)edge); (void)processValue(edge); src32 += srcStride; } } else if (!src32 && dst32) { while (n --> 0) { - v4u32 ret = processValue(zero); - __lsx_vstelm_w(ret, dst32, 0, 0); // 3rd is offset, 4th is idx. + v16u8 ret_vec = processValue(zero); + v4u32 ret = (v4u32)ret_vec; + __lsx_vstelm_w((__m128i)ret, dst32, 0, 0); // 3rd is offset, 4th is idx. dst32 += dstStride; } } else if (src32 && dst32) { while (n --> 0) { - v4u32 edge = __lsx_vinsgr2vr_w(zero, *src32, 0); - edge = __lsx_vilvl_b(zero, edge); - edge = __lsx_vilvl_h(zero, edge); - v4u32 ret = processValue(edge); - __lsx_vstelm_w(ret, dst32, 0, 0); + v4u32 edge = (v4u32)__lsx_vinsgr2vr_w((__m128i)zero, *src32, 0); + edge = (v4u32)__lsx_vilvl_b((__m128i)zero, (__m128i)edge); + edge = (v4u32)__lsx_vilvl_h((__m128i)zero, (__m128i)edge); + v16u8 ret_vec = processValue(edge); + v4u32 ret = (v4u32)ret_vec; + __lsx_vstelm_w((__m128i)ret, dst32, 0, 0); src32 += srcStride; dst32 += dstStride; } @@ -622,9 +624,9 @@ class ThreeBoxApproxPass final : public Pass { fBuffer1Cursor = buffer1Cursor; fBuffer2Cursor = buffer2Cursor; - __lsx_vst(sum0, fSum0, 0); - __lsx_vst(sum1, fSum1, 0); - __lsx_vst(sum2, fSum2, 0); + __lsx_vst((__m128i)sum0, fSum0, 0); + __lsx_vst((__m128i)sum1, fSum1, 0); + __lsx_vst((__m128i)sum2, fSum2, 0); #else skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor; skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor; diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h index 19a842212855..f8463f087c77 100644 --- a/src/opts/SkRasterPipeline_opts.h +++ b/src/opts/SkRasterPipeline_opts.h @@ -960,14 +960,14 @@ namespace SK_OPTS_NS { SI F min(F a, F b) { return __lasx_xvfmin_s(a,b); } SI F max(F a, F b) { return __lasx_xvfmax_s(a,b); } - SI I32 min(I32 a, I32 b) { return __lasx_xvmin_w(a,b); } - SI U32 min(U32 a, U32 b) { return __lasx_xvmin_wu(a,b); } - SI I32 max(I32 a, I32 b) { return __lasx_xvmax_w(a,b); } - SI U32 max(U32 a, U32 b) { return __lasx_xvmax_wu(a,b); } + SI I32 min(I32 a, I32 b) { return (I32)__lasx_xvmin_w((__m256i)a,(__m256i)b); } + SI U32 min(U32 a, U32 b) { return (U32)__lasx_xvmin_wu((__m256i)a,(__m256i)b); } + SI I32 max(I32 a, I32 b) { return 
(I32)__lasx_xvmax_w((__m256i)a,(__m256i)b); } + SI U32 max(U32 a, U32 b) { return (U32)__lasx_xvmax_wu((__m256i)a,(__m256i)b); } SI F mad(F f, F m, F a) { return __lasx_xvfmadd_s(f, m, a); } SI F nmad(F f, F m, F a) { return __lasx_xvfmadd_s(-f, m, a); } - SI F abs_ (F v) { return (F)__lasx_xvand_v((I32)v, (I32)(0-v)); } + SI F abs_ (F v) { return (F)__lasx_xvand_v((__m256i)v, (__m256i)(0-v)); } SI I32 abs_(I32 v) { return max(v, -v); } SI F rcp_approx(F v) { return __lasx_xvfrecip_s(v); } SI F rcp_precise (F v) { F e = rcp_approx(v); return e * nmad(v, e, F() + 2.0f); } @@ -976,17 +976,17 @@ namespace SK_OPTS_NS { SI U32 iround(F v) { F t = F() + 0.5f; - return __lasx_xvftintrz_w_s(v + t); + return (U32)__lasx_xvftintrz_w_s(v + t); } SI U32 round(F v) { F t = F() + 0.5f; - return __lasx_xvftintrz_w_s(v + t); + return (U32)__lasx_xvftintrz_w_s(v + t); } SI U16 pack(U32 v) { - return __lsx_vpickev_h(__lsx_vsat_wu(emulate_lasx_d_xr2vr_h(v), 15), - __lsx_vsat_wu(emulate_lasx_d_xr2vr_l(v), 15)); + return (U16)__lsx_vpickev_h(__lsx_vsat_wu(emulate_lasx_d_xr2vr_h((__m256i)v), 15), + __lsx_vsat_wu(emulate_lasx_d_xr2vr_l((__m256i)v), 15)); } SI U8 pack(U16 v) { @@ -996,12 +996,12 @@ namespace SK_OPTS_NS { } SI bool any(I32 c){ - v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c)); + v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), (__m256i)c)); return (retv[0] | retv[4]) != 0b0000; } SI bool all(I32 c){ - v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c)); + v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), (__m256i)c)); return (retv[0] & retv[4]) == 0b1111; } @@ -1038,16 +1038,16 @@ namespace SK_OPTS_NS { } SI void load2(const uint16_t* ptr, U16* r, U16* g) { - U16 _0123 = __lsx_vld(ptr, 0), - _4567 = __lsx_vld(ptr, 16); - *r = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_4567, 16), 16), 15), + U16 _0123 = (U16)__lsx_vld(ptr, 0), + _4567 = (U16)__lsx_vld(ptr, 16); + *r = (U16)__lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_4567, 16), 16), 15), __lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_0123, 16), 16), 15)); - *g = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(_4567, 16), 15), + *g = (U16)__lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(_4567, 16), 15), __lsx_vsat_w(__lsx_vsrai_w(_0123, 16), 15)); } SI void store2(uint16_t* ptr, U16 r, U16 g) { - auto _0123 = __lsx_vilvl_h(g, r), - _4567 = __lsx_vilvh_h(g, r); + auto _0123 = __lsx_vilvl_h((__m128i)g, (__m128i)r), + _4567 = __lsx_vilvh_h((__m128i)g, (__m128i)r); __lsx_vst(_0123, ptr, 0); __lsx_vst(_4567, ptr, 16); } @@ -1068,17 +1068,17 @@ namespace SK_OPTS_NS { rg4567 = __lsx_vilvl_h(_57, _46), ba4567 = __lsx_vilvh_h(_57, _46); - *r = __lsx_vilvl_d(rg4567, rg0123); - *g = __lsx_vilvh_d(rg4567, rg0123); - *b = __lsx_vilvl_d(ba4567, ba0123); - *a = __lsx_vilvh_d(ba4567, ba0123); + *r = (U16)__lsx_vilvl_d((__m128i)rg4567, (__m128i)rg0123); + *g = (U16)__lsx_vilvh_d((__m128i)rg4567, (__m128i)rg0123); + *b = (U16)__lsx_vilvl_d((__m128i)ba4567, (__m128i)ba0123); + *a = (U16)__lsx_vilvh_d((__m128i)ba4567, (__m128i)ba0123); } SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) { - auto rg0123 = __lsx_vilvl_h(g, r), // r0 g0 r1 g1 r2 g2 r3 g3 - rg4567 = __lsx_vilvh_h(g, r), // r4 g4 r5 g5 r6 g6 r7 g7 - ba0123 = __lsx_vilvl_h(a, b), - ba4567 = __lsx_vilvh_h(a, b); + auto rg0123 = __lsx_vilvl_h((__m128i)g, (__m128i)r), // r0 g0 r1 g1 r2 g2 r3 g3 + rg4567 = __lsx_vilvh_h((__m128i)g, (__m128i)r), // r4 g4 r5 g5 r6 g6 r7 g7 + ba0123 = 
__lsx_vilvl_h((__m128i)a, (__m128i)b), + ba4567 = __lsx_vilvh_h((__m128i)a, (__m128i)b); auto _01 =__lsx_vilvl_w(ba0123, rg0123), _23 =__lsx_vilvh_w(ba0123, rg0123), @@ -1163,14 +1163,14 @@ namespace SK_OPTS_NS { SI F min(F a, F b) { return __lsx_vfmin_s(a,b); } SI F max(F a, F b) { return __lsx_vfmax_s(a,b); } - SI I32 min(I32 a, I32 b) { return __lsx_vmin_w(a,b); } - SI U32 min(U32 a, U32 b) { return __lsx_vmin_wu(a,b); } - SI I32 max(I32 a, I32 b) { return __lsx_vmax_w(a,b); } - SI U32 max(U32 a, U32 b) { return __lsx_vmax_wu(a,b); } + SI I32 min(I32 a, I32 b) { return (I32)__lsx_vmin_w((__m128i)a,(__m128i)b); } + SI U32 min(U32 a, U32 b) { return (U32)__lsx_vmin_wu((__m128i)a,(__m128i)b); } + SI I32 max(I32 a, I32 b) { return (I32)__lsx_vmax_w((__m128i)a,(__m128i)b); } + SI U32 max(U32 a, U32 b) { return (U32)__lsx_vmax_wu((__m128i)a,(__m128i)b); } SI F mad(F f, F m, F a) { return __lsx_vfmadd_s(f, m, a); } SI F nmad(F f, F m, F a) { return __lsx_vfmadd_s(-f, m, a); } - SI F abs_(F v) { return (F)__lsx_vand_v((I32)v, (I32)(0-v)); } + SI F abs_(F v) { return (F)__lsx_vand_v((__m128i)v, (__m128i)(0-v)); } SI I32 abs_(I32 v) { return max(v, -v); } SI F rcp_approx (F v) { return __lsx_vfrecip_s(v); } SI F rcp_precise (F v) { F e = rcp_approx(v); return e * nmad(v, e, F() + 2.0f); } @@ -1179,11 +1179,11 @@ namespace SK_OPTS_NS { SI U32 iround(F v) { F t = F() + 0.5f; - return __lsx_vftintrz_w_s(v + t); } + return (U32)__lsx_vftintrz_w_s(v + t); } SI U32 round(F v) { F t = F() + 0.5f; - return __lsx_vftintrz_w_s(v + t); } + return (U32)__lsx_vftintrz_w_s(v + t); } SI U16 pack(U32 v) { __m128i tmp = __lsx_vsat_wu(v, 15); @@ -1199,12 +1199,12 @@ namespace SK_OPTS_NS { } SI bool any(I32 c){ - v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c)); + v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), (__m128i)c)); return retv[0] != 0b0000; } SI bool all(I32 c){ - v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c)); + v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), (__m128i)c)); return retv[0] == 0b1111; } @@ -1255,7 +1255,7 @@ namespace SK_OPTS_NS { } SI void store2(uint16_t* ptr, U16 r, U16 g) { - U32 rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r)); + __m128i rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r)); __lsx_vst(rg, ptr, 0); } @@ -3458,26 +3458,26 @@ SI void gradient_lookup(const SkRasterPipelineContexts::GradientCtx* c, U32 idx, } else #elif defined(SKRP_CPU_LASX) if (c->stopCount <= 8) { - fr = (__m256)__lasx_xvperm_w(__lasx_xvld(c->factors[0], 0), idx); - br = (__m256)__lasx_xvperm_w(__lasx_xvld(c->biases[0], 0), idx); - fg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->factors[1], 0), idx); - bg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->biases[1], 0), idx); - fb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->factors[2], 0), idx); - bb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->biases[2], 0), idx); - fa = (__m256)__lasx_xvperm_w(__lasx_xvld(c->factors[3], 0), idx); - ba = (__m256)__lasx_xvperm_w(__lasx_xvld(c->biases[3], 0), idx); + fr = (__m256)__lasx_xvperm_w(__lasx_xvld(c->factors[0], 0), (__m256i)idx); + br = (__m256)__lasx_xvperm_w(__lasx_xvld(c->biases[0], 0), (__m256i)idx); + fg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->factors[1], 0), (__m256i)idx); + bg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->biases[1], 0), (__m256i)idx); + fb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->factors[2], 0), (__m256i)idx); + bb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->biases[2], 0), (__m256i)idx); + fa = 
(__m256)__lasx_xvperm_w(__lasx_xvld(c->factors[3], 0), (__m256i)idx); + ba = (__m256)__lasx_xvperm_w(__lasx_xvld(c->biases[3], 0), (__m256i)idx); } else #elif defined(SKRP_CPU_LSX) if (c->stopCount <= 4) { __m128i zero = __lsx_vldi(0); - fr = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->factors[0], 0)); - br = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->biases[0], 0)); - fg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->factors[1], 0)); - bg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->biases[1], 0)); - fb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->factors[2], 0)); - bb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->biases[2], 0)); - fa = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->factors[3], 0)); - ba = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->biases[3], 0)); + fr = (__m128)__lsx_vshuf_w((__m128i)idx, zero, __lsx_vld(c->factors[0], 0)); + br = (__m128)__lsx_vshuf_w((__m128i)idx, zero, __lsx_vld(c->biases[0], 0)); + fg = (__m128)__lsx_vshuf_w((__m128i)idx, zero, __lsx_vld(c->factors[1], 0)); + bg = (__m128)__lsx_vshuf_w((__m128i)idx, zero, __lsx_vld(c->biases[1], 0)); + fb = (__m128)__lsx_vshuf_w((__m128i)idx, zero, __lsx_vld(c->factors[2], 0)); + bb = (__m128)__lsx_vshuf_w((__m128i)idx, zero, __lsx_vld(c->biases[2], 0)); + fa = (__m128)__lsx_vshuf_w((__m128i)idx, zero, __lsx_vld(c->factors[3], 0)); + ba = (__m128)__lsx_vshuf_w((__m128i)idx, zero, __lsx_vld(c->biases[3], 0)); } else #endif { @@ -5625,11 +5625,11 @@ SI I16 scaled_mult(I16 a, I16 b) { #elif defined(SKRP_CPU_NEON) return vqrdmulhq_s16(a, b); #elif defined(SKRP_CPU_LASX) - I16 res = __lasx_xvmuh_h(a, b); - return __lasx_xvslli_h(res, 1); + __m256i res = __lasx_xvmuh_h((__m256i)a, (__m256i)b); + return (I16)__lasx_xvslli_h(res, 1); #elif defined(SKRP_CPU_LSX) - I16 res = __lsx_vmuh_h(a, b); - return __lsx_vslli_h(res, 1); + __m128i res = __lsx_vmuh_h((__m128i)a, (__m128i)b); + return (I16)__lsx_vslli_h(res, 1); #else const I32 roundingTerm = I32_(1 << 14); return cast((cast(a) * cast(b) + roundingTerm) >> 15); @@ -6055,7 +6055,7 @@ SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { split(v, &_02,&_13); __m256i tmp0 = __lasx_xvsat_wu(_02, 15); __m256i tmp1 = __lasx_xvsat_wu(_13, 15); - return __lasx_xvpickev_h(tmp1, tmp0); + return (U16)__lasx_xvpickev_h(tmp1, tmp0); }; #elif defined(SKRP_CPU_LSX) __m128i _01, _23, rg, ba; @@ -6065,10 +6065,10 @@ SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { __m128i mask_00ff = __lsx_vreplgr2vr_h(0xff); - *r = __lsx_vand_v(rg, mask_00ff); - *g = __lsx_vsrli_h(rg, 8); - *b = __lsx_vand_v(ba, mask_00ff); - *a = __lsx_vsrli_h(ba, 8); + *r = (U16)__lsx_vand_v(rg, mask_00ff); + *g = (U16)__lsx_vsrli_h(rg, 8); + *b = (U16)__lsx_vand_v(ba, mask_00ff); + *a = (U16)__lsx_vsrli_h(ba, 8); #else auto cast_U16 = [](U32 v) -> U16 { return cast(v); @@ -6096,26 +6096,26 @@ SI void load_8888_(const uint32_t* ptr, U16* r, U16* g, U16* b, U16* a) { SI void store_8888_(uint32_t* ptr, U16 r, U16 g, U16 b, U16 a) { #if defined(SKRP_CPU_LSX) __m128i mask = __lsx_vreplgr2vr_h(255); - r = __lsx_vmin_hu(r, mask); - g = __lsx_vmin_hu(g, mask); - b = __lsx_vmin_hu(b, mask); - a = __lsx_vmin_hu(a, mask); + r = (U16)__lsx_vmin_hu((__m128i)r, mask); + g = (U16)__lsx_vmin_hu((__m128i)g, mask); + b = (U16)__lsx_vmin_hu((__m128i)b, mask); + a = (U16)__lsx_vmin_hu((__m128i)a, mask); - g = __lsx_vslli_h(g, 8); + g = (U16)__lsx_vslli_h((__m128i)g, 8); r = r | g; - a = __lsx_vslli_h(a, 8); + a = (U16)__lsx_vslli_h((__m128i)a, 8); a = a | b; - __m128i r_lo = __lsx_vsllwil_wu_hu(r, 0); - 
__m128i r_hi = __lsx_vexth_wu_hu(r); - __m128i a_lo = __lsx_vsllwil_wu_hu(a, 0); - __m128i a_hi = __lsx_vexth_wu_hu(a); + __m128i r_lo = __lsx_vsllwil_wu_hu((__m128i)r, 0); + __m128i r_hi = __lsx_vexth_wu_hu((__m128i)r); + __m128i a_lo = __lsx_vsllwil_wu_hu((__m128i)a, 0); + __m128i a_hi = __lsx_vexth_wu_hu((__m128i)a); a_lo = __lsx_vslli_w(a_lo, 16); a_hi = __lsx_vslli_w(a_hi, 16); - r = r_lo | a_lo; - a = r_hi | a_hi; + r = (U16)(r_lo | a_lo); + a = (U16)(r_hi | a_hi); store(ptr, join(r, a)); #else r = min(r, 255); @@ -6685,8 +6685,8 @@ LOWP_STAGE_GP(bilerp_clamp_8888, const SkRasterPipelineContexts::GatherCtx* ctx) qy_lo = __lsx_vxor_v(qy_lo, temp); qy_hi = __lsx_vxor_v(qy_hi, temp); - I16 tx = __lsx_vpickev_h(qx_hi, qx_lo); - I16 ty = __lsx_vpickev_h(qy_hi, qy_lo); + I16 tx = (I16)__lsx_vpickev_h(qx_hi, qx_lo); + I16 ty = (I16)__lsx_vpickev_h(qy_hi, qy_lo); #else I16 tx = cast(qx ^ 0x8000), ty = cast(qy ^ 0x8000);
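
For context, a minimal standalone sketch of the pattern this patch applies (not part of the patch, and it uses stand-in names rather than real LSX intrinsics): GCC/Clang vector extensions treat vectors with different element types as distinct types, so values returned by LSX/LASX intrinsics, which are declared in terms of __m128i/__m256i, generally cannot be assigned to Skia's element-typed vectors (v4u32, U16, I32, ...) without an explicit cast unless lax vector conversions are enabled. The m128i_like type and vadd_stub function below are hypothetical stand-ins so the example compiles on any GCC/Clang target.

// Sketch only: m128i_like and vadd_stub are hypothetical stand-ins for the
// __m128i-typed LSX intrinsics; they are not part of Skia or lsxintrin.h.
#include <cstdio>

typedef unsigned int v4u32      __attribute__((vector_size(16)));
typedef long long    m128i_like __attribute__((vector_size(16)));  // plays the role of __m128i

// Stand-in for an intrinsic declared purely in terms of m128i_like.
static m128i_like vadd_stub(m128i_like a, m128i_like b) { return a + b; }

v4u32 add_u32x4(v4u32 a, v4u32 b) {
    // v4u32 r = vadd_stub(a, b);  // rejected by GCC unless -flax-vector-conversions is given
    return (v4u32)vadd_stub((m128i_like)a, (m128i_like)b);  // explicit casts, as in the patch
}

int main() {
    v4u32 a = {1, 2, 3, 4}, b = {10, 20, 30, 40};
    v4u32 r = add_u32x4(a, b);
    std::printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]);  // prints: 11 22 33 44
    return 0;
}

Casts between vector types of the same size only reinterpret the bits, so they satisfy the type checker without changing the generated code.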