diff --git a/src/hexagon.rs b/src/hexagon.rs new file mode 100644 index 00000000..91cf91c3 --- /dev/null +++ b/src/hexagon.rs @@ -0,0 +1,55 @@ +#![cfg(not(feature = "no-asm"))] + +use core::arch::global_asm; + +global_asm!(include_str!("hexagon/func_macro.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfaddsub.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfdiv.s"), options(raw)); + +global_asm!(include_str!("hexagon/dffma.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfminmax.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfmul.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfsqrt.s"), options(raw)); + +global_asm!(include_str!("hexagon/divdi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/divsi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/fastmath2_dlib_asm.s"), options(raw)); + +global_asm!(include_str!("hexagon/fastmath2_ldlib_asm.s"), options(raw)); + +global_asm!( + include_str!("hexagon/memcpy_forward_vp4cp4n2.s"), + options(raw) +); + +global_asm!( + include_str!("hexagon/memcpy_likely_aligned.s"), + options(raw) +); + +global_asm!(include_str!("hexagon/moddi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/modsi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/sfdiv_opt.s"), options(raw)); + +global_asm!(include_str!("hexagon/sfsqrt_opt.s"), options(raw)); + +global_asm!(include_str!("hexagon/udivdi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/udivmoddi4.s"), options(raw)); + +global_asm!(include_str!("hexagon/udivmodsi4.s"), options(raw)); + +global_asm!(include_str!("hexagon/udivsi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/umoddi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/umodsi3.s"), options(raw)); diff --git a/src/hexagon/dfaddsub.s b/src/hexagon/dfaddsub.s new file mode 100644 index 00000000..1f59e460 --- /dev/null +++ b/src/hexagon/dfaddsub.s @@ -0,0 +1,321 @@ + .text + .global __hexagon_adddf3 + .global __hexagon_subdf3 + .type __hexagon_adddf3, @function + .type __hexagon_subdf3, @function + +.global __qdsp_adddf3 ; .set __qdsp_adddf3, __hexagon_adddf3 +.global __hexagon_fast_adddf3 ; .set __hexagon_fast_adddf3, __hexagon_adddf3 +.global __hexagon_fast2_adddf3 ; .set __hexagon_fast2_adddf3, __hexagon_adddf3 +.global __qdsp_subdf3 ; .set __qdsp_subdf3, __hexagon_subdf3 +.global __hexagon_fast_subdf3 ; .set __hexagon_fast_subdf3, __hexagon_subdf3 +.global __hexagon_fast2_subdf3 ; .set __hexagon_fast2_subdf3, __hexagon_subdf3 + + .p2align 5 +__hexagon_adddf3: + { + r4 = extractu(r1,#11,#20) + r5 = extractu(r3,#11,#20) + r13:12 = combine(##0x20000000,#0) + } + { + p3 = dfclass(r1:0,#2) + p3 = dfclass(r3:2,#2) + r9:8 = r13:12 + p2 = cmp.gtu(r5,r4) + } + { + if (!p3) jump .Ladd_abnormal + if (p2) r1:0 = r3:2 + if (p2) r3:2 = r1:0 + if (p2) r5:4 = combine(r4,r5) + } + { + r13:12 = insert(r1:0,#52,#11 -2) + r9:8 = insert(r3:2,#52,#11 -2) + r15 = sub(r4,r5) + r7:6 = combine(#62,#1) + } + + + + + +.Ladd_continue: + { + r15 = min(r15,r7) + + r11:10 = neg(r13:12) + p2 = cmp.gt(r1,#-1) + r14 = #0 + } + { + if (!p2) r13:12 = r11:10 + r11:10 = extractu(r9:8,r15:14) + r9:8 = ASR(r9:8,r15) + + + + + r15:14 = #0 + } + { + p1 = cmp.eq(r11:10,r15:14) + if (!p1.new) r8 = or(r8,r6) + r5 = add(r4,#-1024 -60) + p3 = cmp.gt(r3,#-1) + } + { + r13:12 = add(r13:12,r9:8) + r11:10 = sub(r13:12,r9:8) + r7:6 = combine(#54,##2045) + } + { + p0 = cmp.gtu(r4,r7) + p0 = !cmp.gtu(r4,r6) + if (!p0.new) jump:nt .Ladd_ovf_unf + if (!p3) r13:12 = r11:10 + } + { + r1:0 = 
convert_d2df(r13:12) + p0 = cmp.eq(r13,#0) + p0 = cmp.eq(r12,#0) + if (p0.new) jump:nt .Ladd_zero + } + { + r1 += asl(r5,#20) + jumpr r31 + } + .falign +__hexagon_subdf3: + { + r3 = togglebit(r3,#31) + jump __qdsp_adddf3 + } + + + .falign +.Ladd_zero: + + + { + r28 = USR + r1:0 = #0 + r3 = #1 + } + { + r28 = extractu(r28,#2,#22) + r3 = asl(r3,#31) + } + { + p0 = cmp.eq(r28,#2) + if (p0.new) r1 = xor(r1,r3) + jumpr r31 + } + .falign +.Ladd_ovf_unf: + { + r1:0 = convert_d2df(r13:12) + p0 = cmp.eq(r13,#0) + p0 = cmp.eq(r12,#0) + if (p0.new) jump:nt .Ladd_zero + } + { + r28 = extractu(r1,#11,#20) + r1 += asl(r5,#20) + } + { + r5 = add(r5,r28) + r3:2 = combine(##0x00100000,#0) + } + { + p0 = cmp.gt(r5,##1024 +1024 -2) + if (p0.new) jump:nt .Ladd_ovf + } + { + p0 = cmp.gt(r5,#0) + if (p0.new) jumpr:t r31 + r28 = sub(#1,r5) + } + { + r3:2 = insert(r1:0,#52,#0) + r1:0 = r13:12 + } + { + r3:2 = lsr(r3:2,r28) + } + { + r1:0 = insert(r3:2,#63,#0) + jumpr r31 + } + .falign +.Ladd_ovf: + + { + r1:0 = r13:12 + r28 = USR + r13:12 = combine(##0x7fefffff,#-1) + } + { + r5 = extractu(r28,#2,#22) + r28 = or(r28,#0x28) + r9:8 = combine(##0x7ff00000,#0) + } + { + USR = r28 + r5 ^= lsr(r1,#31) + r28 = r5 + } + { + p0 = !cmp.eq(r28,#1) + p0 = !cmp.eq(r5,#2) + if (p0.new) r13:12 = r9:8 + } + { + r1:0 = insert(r13:12,#63,#0) + } + { + p0 = dfcmp.eq(r1:0,r1:0) + jumpr r31 + } + +.Ladd_abnormal: + { + r13:12 = extractu(r1:0,#63,#0) + r9:8 = extractu(r3:2,#63,#0) + } + { + p3 = cmp.gtu(r13:12,r9:8) + if (!p3.new) r1:0 = r3:2 + if (!p3.new) r3:2 = r1:0 + } + { + + p0 = dfclass(r1:0,#0x0f) + if (!p0.new) jump:nt .Linvalid_nan_add + if (!p3) r13:12 = r9:8 + if (!p3) r9:8 = r13:12 + } + { + + + p1 = dfclass(r1:0,#0x08) + if (p1.new) jump:nt .Linf_add + } + { + p2 = dfclass(r3:2,#0x01) + if (p2.new) jump:nt .LB_zero + r13:12 = #0 + } + + { + p0 = dfclass(r1:0,#4) + if (p0.new) jump:nt .Ladd_two_subnormal + r13:12 = combine(##0x20000000,#0) + } + { + r4 = extractu(r1,#11,#20) + r5 = #1 + + r9:8 = asl(r9:8,#11 -2) + } + + + + { + r13:12 = insert(r1:0,#52,#11 -2) + r15 = sub(r4,r5) + r7:6 = combine(#62,#1) + jump .Ladd_continue + } + +.Ladd_two_subnormal: + { + r13:12 = extractu(r1:0,#63,#0) + r9:8 = extractu(r3:2,#63,#0) + } + { + r13:12 = neg(r13:12) + r9:8 = neg(r9:8) + p0 = cmp.gt(r1,#-1) + p1 = cmp.gt(r3,#-1) + } + { + if (p0) r13:12 = r1:0 + if (p1) r9:8 = r3:2 + } + { + r13:12 = add(r13:12,r9:8) + } + { + r9:8 = neg(r13:12) + p0 = cmp.gt(r13,#-1) + r3:2 = #0 + } + { + if (!p0) r1:0 = r9:8 + if (p0) r1:0 = r13:12 + r3 = ##0x80000000 + } + { + if (!p0) r1 = or(r1,r3) + p0 = dfcmp.eq(r1:0,r3:2) + if (p0.new) jump:nt .Lzero_plus_zero + } + { + jumpr r31 + } + +.Linvalid_nan_add: + { + r28 = convert_df2sf(r1:0) + p0 = dfclass(r3:2,#0x0f) + if (p0.new) r3:2 = r1:0 + } + { + r2 = convert_df2sf(r3:2) + r1:0 = #-1 + jumpr r31 + } + .falign +.LB_zero: + { + p0 = dfcmp.eq(r13:12,r1:0) + if (!p0.new) jumpr:t r31 + } + + + + +.Lzero_plus_zero: + { + p0 = cmp.eq(r1:0,r3:2) + if (p0.new) jumpr:t r31 + } + { + r28 = USR + } + { + r28 = extractu(r28,#2,#22) + r1:0 = #0 + } + { + p0 = cmp.eq(r28,#2) + if (p0.new) r1 = ##0x80000000 + jumpr r31 + } +.Linf_add: + + { + p0 = !cmp.eq(r1,r3) + p0 = dfclass(r3:2,#8) + if (!p0.new) jumpr:t r31 + } + { + r2 = ##0x7f800001 + } + { + r1:0 = convert_sf2df(r2) + jumpr r31 + } +.size __hexagon_adddf3,.-__hexagon_adddf3 diff --git a/src/hexagon/dfdiv.s b/src/hexagon/dfdiv.s new file mode 100644 index 00000000..6d65dbfc --- /dev/null +++ b/src/hexagon/dfdiv.s @@ -0,0 +1,372 @@ + .text + .global 
__hexagon_divdf3 + .type __hexagon_divdf3,@function + .global __qdsp_divdf3 ; .set __qdsp_divdf3, __hexagon_divdf3 + .global __hexagon_fast_divdf3 ; .set __hexagon_fast_divdf3, __hexagon_divdf3 + .global __hexagon_fast2_divdf3 ; .set __hexagon_fast2_divdf3, __hexagon_divdf3 + .p2align 5 +__hexagon_divdf3: + { + p2 = dfclass(r1:0,#0x02) + p2 = dfclass(r3:2,#0x02) + r13:12 = combine(r3,r1) + r28 = xor(r1,r3) + } + { + if (!p2) jump .Ldiv_abnormal + r7:6 = extractu(r3:2,#23,#52 -23) + r8 = ##0x3f800001 + } + { + r9 = or(r8,r6) + r13 = extractu(r13,#11,#52 -32) + r12 = extractu(r12,#11,#52 -32) + p3 = cmp.gt(r28,#-1) + } + + +.Ldenorm_continue: + { + r11,p0 = sfrecipa(r8,r9) + r10 = and(r8,#-2) + r28 = #1 + r12 = sub(r12,r13) + } + + + { + r10 -= sfmpy(r11,r9):lib + r1 = insert(r28,#11 +1,#52 -32) + r13 = ##0x00800000 << 3 + } + { + r11 += sfmpy(r11,r10):lib + r3 = insert(r28,#11 +1,#52 -32) + r10 = and(r8,#-2) + } + { + r10 -= sfmpy(r11,r9):lib + r5 = #-0x3ff +1 + r4 = #0x3ff -1 + } + { + r11 += sfmpy(r11,r10):lib + p1 = cmp.gt(r12,r5) + p1 = !cmp.gt(r12,r4) + } + { + r13 = insert(r11,#23,#3) + r5:4 = #0 + r12 = add(r12,#-61) + } + + + + + { + r13 = add(r13,#((-3) << 3)) + } + { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASL(r7:6, # ( 14 )); r1:0 -= asl(r15:14, # 32); } + { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 1 )); r1:0 -= asl(r15:14, # 32); } + { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 16 )); r1:0 -= asl(r15:14, # 32); } + { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 31 )); r1:0 -= asl(r15:14, # 32); r7:6=# ( 0 ); } + + + + + + + + { + + r15:14 = sub(r1:0,r3:2) + p0 = cmp.gtu(r3:2,r1:0) + + if (!p0.new) r6 = #2 + } + { + r5:4 = add(r5:4,r7:6) + if (!p0) r1:0 = r15:14 + r15:14 = #0 + } + { + p0 = cmp.eq(r1:0,r15:14) + if (!p0.new) r4 = or(r4,r28) + } + { + r7:6 = neg(r5:4) + } + { + if (!p3) r5:4 = r7:6 + } + { + r1:0 = convert_d2df(r5:4) + if (!p1) jump .Ldiv_ovf_unf + } + { + r1 += asl(r12,#52 -32) + jumpr r31 + } + +.Ldiv_ovf_unf: + { + r1 += asl(r12,#52 -32) + r13 = extractu(r1,#11,#52 -32) + } + { + r7:6 = abs(r5:4) + r12 = add(r12,r13) + } + { + p0 = cmp.gt(r12,##0x3ff +0x3ff) + if (p0.new) jump:nt .Ldiv_ovf + } + { + p0 = cmp.gt(r12,#0) + if (p0.new) jump:nt .Lpossible_unf2 + } + { + r13 = add(clb(r7:6),#-1) + r12 = sub(#7,r12) + r10 = USR + r11 = #63 + } + { + r13 = min(r12,r11) + r11 = or(r10,#0x030) + r7:6 = asl(r7:6,r13) + r12 = #0 + } + { + r15:14 = extractu(r7:6,r13:12) + r7:6 = lsr(r7:6,r13) + r3:2 = #1 + } + { + p0 = cmp.gtu(r3:2,r15:14) + if (!p0.new) r6 = or(r2,r6) + r7 = setbit(r7,#52 -32+4) + } + { + r5:4 = neg(r7:6) + p0 = bitsclr(r6,#(1<<4)-1) + if (!p0.new) r10 = r11 + } + { + USR = r10 + if (p3) r5:4 = r7:6 + r10 = #-0x3ff -(52 +4) + } + { + r1:0 = convert_d2df(r5:4) + } + { + r1 += asl(r10,#52 -32) + jumpr r31 + } + + +.Lpossible_unf2: + + + { + r3:2 = extractu(r1:0,#63,#0) + r15:14 = combine(##0x00100000,#0) + r10 = #0x7FFF + } + { + p0 = dfcmp.eq(r15:14,r3:2) + p0 = bitsset(r7,r10) + } + + + + + + + { + if (!p0) jumpr r31 + r10 = USR + } + + { + r10 = or(r10,#0x30) + } + { + USR = r10 + } + { + p0 = dfcmp.eq(r1:0,r1:0) + jumpr r31 + } + +.Ldiv_ovf: + + + + { + r10 = USR + r3:2 = combine(##0x7fefffff,#-1) + r1 = 
mux(p3,#0,#-1) + } + { + r7:6 = combine(##0x7ff00000,#0) + r5 = extractu(r10,#2,#22) + r10 = or(r10,#0x28) + } + { + USR = r10 + r5 ^= lsr(r1,#31) + r4 = r5 + } + { + p0 = !cmp.eq(r4,#1) + p0 = !cmp.eq(r5,#2) + if (p0.new) r3:2 = r7:6 + p0 = dfcmp.eq(r3:2,r3:2) + } + { + r1:0 = insert(r3:2,#63,#0) + jumpr r31 + } + + + + + + + +.Ldiv_abnormal: + { + p0 = dfclass(r1:0,#0x0F) + p0 = dfclass(r3:2,#0x0F) + p3 = cmp.gt(r28,#-1) + } + { + p1 = dfclass(r1:0,#0x08) + p1 = dfclass(r3:2,#0x08) + } + { + p2 = dfclass(r1:0,#0x01) + p2 = dfclass(r3:2,#0x01) + } + { + if (!p0) jump .Ldiv_nan + if (p1) jump .Ldiv_invalid + } + { + if (p2) jump .Ldiv_invalid + } + { + p2 = dfclass(r1:0,#(0x0F ^ 0x01)) + p2 = dfclass(r3:2,#(0x0F ^ 0x08)) + } + { + p1 = dfclass(r1:0,#(0x0F ^ 0x08)) + p1 = dfclass(r3:2,#(0x0F ^ 0x01)) + } + { + if (!p2) jump .Ldiv_zero_result + if (!p1) jump .Ldiv_inf_result + } + + + + + + { + p0 = dfclass(r1:0,#0x02) + p1 = dfclass(r3:2,#0x02) + r10 = ##0x00100000 + } + { + r13:12 = combine(r3,r1) + r1 = insert(r10,#11 +1,#52 -32) + r3 = insert(r10,#11 +1,#52 -32) + } + { + if (p0) r1 = or(r1,r10) + if (p1) r3 = or(r3,r10) + } + { + r5 = add(clb(r1:0),#-11) + r4 = add(clb(r3:2),#-11) + r10 = #1 + } + { + r12 = extractu(r12,#11,#52 -32) + r13 = extractu(r13,#11,#52 -32) + } + { + r1:0 = asl(r1:0,r5) + r3:2 = asl(r3:2,r4) + if (!p0) r12 = sub(r10,r5) + if (!p1) r13 = sub(r10,r4) + } + { + r7:6 = extractu(r3:2,#23,#52 -23) + } + { + r9 = or(r8,r6) + jump .Ldenorm_continue + } + +.Ldiv_zero_result: + { + r1 = xor(r1,r3) + r3:2 = #0 + } + { + r1:0 = insert(r3:2,#63,#0) + jumpr r31 + } +.Ldiv_inf_result: + { + p2 = dfclass(r3:2,#0x01) + p2 = dfclass(r1:0,#(0x0F ^ 0x08)) + } + { + r10 = USR + if (!p2) jump 1f + r1 = xor(r1,r3) + } + { + r10 = or(r10,#0x04) + } + { + USR = r10 + } +1: + { + r3:2 = combine(##0x7ff00000,#0) + p0 = dfcmp.uo(r3:2,r3:2) + } + { + r1:0 = insert(r3:2,#63,#0) + jumpr r31 + } +.Ldiv_nan: + { + p0 = dfclass(r1:0,#0x10) + p1 = dfclass(r3:2,#0x10) + if (!p0.new) r1:0 = r3:2 + if (!p1.new) r3:2 = r1:0 + } + { + r5 = convert_df2sf(r1:0) + r4 = convert_df2sf(r3:2) + } + { + r1:0 = #-1 + jumpr r31 + } + +.Ldiv_invalid: + { + r10 = ##0x7f800001 + } + { + r1:0 = convert_sf2df(r10) + jumpr r31 + } +.size __hexagon_divdf3,.-__hexagon_divdf3 diff --git a/src/hexagon/dffma.s b/src/hexagon/dffma.s new file mode 100644 index 00000000..043a1d29 --- /dev/null +++ b/src/hexagon/dffma.s @@ -0,0 +1,536 @@ + .text + .global __hexagon_fmadf4 + .type __hexagon_fmadf4,@function + .global __hexagon_fmadf5 + .type __hexagon_fmadf5,@function + .global fma + .type fma,@function + .global __qdsp_fmadf5 ; .set __qdsp_fmadf5, __hexagon_fmadf5 + .p2align 5 +__hexagon_fmadf4: +__hexagon_fmadf5: +fma: + { + p0 = dfclass(r1:0,#2) + p0 = dfclass(r3:2,#2) + r13:12 = #0 + r15:14 = #0 + } + { + r13:12 = insert(r1:0,#52,#11 -3) + r15:14 = insert(r3:2,#52,#11 -3) + r7 = ##0x10000000 + allocframe(#32) + } + { + r9:8 = mpyu(r12,r14) + if (!p0) jump .Lfma_abnormal_ab + r13 = or(r13,r7) + r15 = or(r15,r7) + } + { + p0 = dfclass(r5:4,#2) + if (!p0.new) jump:nt .Lfma_abnormal_c + r11:10 = combine(r7,#0) + r7:6 = combine(#0,r9) + } +.Lfma_abnormal_c_restart: + { + r7:6 += mpyu(r14,r13) + r11:10 = insert(r5:4,#52,#11 -3) + memd(r29+#0) = r17:16 + memd(r29+#8) = r19:18 + } + { + r7:6 += mpyu(r12,r15) + r19:18 = neg(r11:10) + p0 = cmp.gt(r5,#-1) + r28 = xor(r1,r3) + } + { + r18 = extractu(r1,#11,#20) + r19 = extractu(r3,#11,#20) + r17:16 = combine(#0,r7) + if (!p0) r11:10 = r19:18 + } + { + r17:16 += mpyu(r13,r15) + r9:8 = 
combine(r6,r8) + r18 = add(r18,r19) + + + + + r19 = extractu(r5,#11,#20) + } + { + r18 = add(r18,#-1023 +(4)) + p3 = !cmp.gt(r28,#-1) + r7:6 = #0 + r15:14 = #0 + } + { + r7:6 = sub(r7:6,r9:8,p3):carry + p0 = !cmp.gt(r28,#-1) + p1 = cmp.gt(r19,r18) + if (p1.new) r19:18 = combine(r18,r19) + } + { + r15:14 = sub(r15:14,r17:16,p3):carry + if (p0) r9:8 = r7:6 + + + + + r7:6 = #0 + r19 = sub(r18,r19) + } + { + if (p0) r17:16 = r15:14 + p0 = cmp.gt(r19,#63) + if (p1) r9:8 = r7:6 + if (p1) r7:6 = r9:8 + } + + + + + + + + { + if (p1) r17:16 = r11:10 + if (p1) r11:10 = r17:16 + if (p0) r19 = add(r19,#-64) + r28 = #63 + } + { + + if (p0) r7:6 = r11:10 + r28 = asr(r11,#31) + r13 = min(r19,r28) + r12 = #0 + } + + + + + + + { + if (p0) r11:10 = combine(r28,r28) + r5:4 = extract(r7:6,r13:12) + r7:6 = lsr(r7:6,r13) + r12 = sub(#64,r13) + } + { + r15:14 = #0 + r28 = #-2 + r7:6 |= lsl(r11:10,r12) + r11:10 = asr(r11:10,r13) + } + { + p3 = cmp.gtu(r5:4,r15:14) + if (p3.new) r6 = and(r6,r28) + + + + r15:14 = #1 + r5:4 = #0 + } + { + r9:8 = add(r7:6,r9:8,p3):carry + } + { + r17:16 = add(r11:10,r17:16,p3):carry + r28 = #62 + } + + + + + + + + { + r12 = add(clb(r17:16),#-2) + if (!cmp.eq(r12.new,r28)) jump:t 1f + } + + { + r11:10 = extractu(r9:8,#62,#2) + r9:8 = asl(r9:8,#62) + r18 = add(r18,#-62) + } + { + r17:16 = insert(r11:10,#62,#0) + } + { + r12 = add(clb(r17:16),#-2) + } + .falign +1: + { + r11:10 = asl(r17:16,r12) + r5:4 |= asl(r9:8,r12) + r13 = sub(#64,r12) + r18 = sub(r18,r12) + } + { + r11:10 |= lsr(r9:8,r13) + p2 = cmp.gtu(r15:14,r5:4) + r28 = #1023 +1023 -2 + } + { + if (!p2) r10 = or(r10,r14) + + p0 = !cmp.gt(r18,r28) + p0 = cmp.gt(r18,#1) + if (!p0.new) jump:nt .Lfma_ovf_unf + } + { + + p0 = cmp.gtu(r15:14,r11:10) + r1:0 = convert_d2df(r11:10) + r18 = add(r18,#-1023 -60) + r17:16 = memd(r29+#0) + } + { + r1 += asl(r18,#20) + r19:18 = memd(r29+#8) + if (!p0) dealloc_return + } +.Ladd_yields_zero: + + { + r28 = USR + r1:0 = #0 + } + { + r28 = extractu(r28,#2,#22) + r17:16 = memd(r29+#0) + r19:18 = memd(r29+#8) + } + { + p0 = cmp.eq(r28,#2) + if (p0.new) r1 = ##0x80000000 + dealloc_return + } +.Lfma_ovf_unf: + { + p0 = cmp.gtu(r15:14,r11:10) + if (p0.new) jump:nt .Ladd_yields_zero + } + { + r1:0 = convert_d2df(r11:10) + r18 = add(r18,#-1023 -60) + r28 = r18 + } + + + { + r1 += asl(r18,#20) + r7 = extractu(r1,#11,#20) + } + { + r6 = add(r18,r7) + r17:16 = memd(r29+#0) + r19:18 = memd(r29+#8) + r9:8 = abs(r11:10) + } + { + p0 = cmp.gt(r6,##1023 +1023) + if (p0.new) jump:nt .Lfma_ovf + } + { + p0 = cmp.gt(r6,#0) + if (p0.new) jump:nt .Lpossible_unf0 + } + { + + + + r7 = add(clb(r9:8),#-2) + r6 = sub(#1+5,r28) + p3 = cmp.gt(r11,#-1) + } + + + + { + r6 = add(r6,r7) + r9:8 = asl(r9:8,r7) + r1 = USR + r28 = #63 + } + { + r7 = min(r6,r28) + r6 = #0 + r0 = #0x0030 + } + { + r3:2 = extractu(r9:8,r7:6) + r9:8 = asr(r9:8,r7) + } + { + p0 = cmp.gtu(r15:14,r3:2) + if (!p0.new) r8 = or(r8,r14) + r9 = setbit(r9,#20 +3) + } + { + r11:10 = neg(r9:8) + p1 = bitsclr(r8,#(1<<3)-1) + if (!p1.new) r1 = or(r1,r0) + r3:2 = #0 + } + { + if (p3) r11:10 = r9:8 + USR = r1 + r28 = #-1023 -(52 +3) + } + { + r1:0 = convert_d2df(r11:10) + } + { + r1 += asl(r28,#20) + dealloc_return + } +.Lpossible_unf0: + { + r28 = ##0x7fefffff + r9:8 = abs(r11:10) + } + { + p0 = cmp.eq(r0,#0) + p0 = bitsclr(r1,r28) + if (!p0.new) dealloc_return:t + r28 = #0x7fff + } + { + p0 = bitsset(r9,r28) + r3 = USR + r2 = #0x0030 + } + { + if (p0) r3 = or(r3,r2) + } + { + USR = r3 + } + { + p0 = dfcmp.eq(r1:0,r1:0) + dealloc_return + } +.Lfma_ovf: + { + r28 = 
USR + r11:10 = combine(##0x7fefffff,#-1) + r1:0 = r11:10 + } + { + r9:8 = combine(##0x7ff00000,#0) + r3 = extractu(r28,#2,#22) + r28 = or(r28,#0x28) + } + { + USR = r28 + r3 ^= lsr(r1,#31) + r2 = r3 + } + { + p0 = !cmp.eq(r2,#1) + p0 = !cmp.eq(r3,#2) + } + { + p0 = dfcmp.eq(r9:8,r9:8) + if (p0.new) r11:10 = r9:8 + } + { + r1:0 = insert(r11:10,#63,#0) + dealloc_return + } +.Lfma_abnormal_ab: + { + r9:8 = extractu(r1:0,#63,#0) + r11:10 = extractu(r3:2,#63,#0) + deallocframe + } + { + p3 = cmp.gtu(r9:8,r11:10) + if (!p3.new) r1:0 = r3:2 + if (!p3.new) r3:2 = r1:0 + } + { + p0 = dfclass(r1:0,#0x0f) + if (!p0.new) jump:nt .Lnan + if (!p3) r9:8 = r11:10 + if (!p3) r11:10 = r9:8 + } + { + p1 = dfclass(r1:0,#0x08) + p1 = dfclass(r3:2,#0x0e) + } + { + p0 = dfclass(r1:0,#0x08) + p0 = dfclass(r3:2,#0x01) + } + { + if (p1) jump .Lab_inf + p2 = dfclass(r3:2,#0x01) + } + { + if (p0) jump .Linvalid + if (p2) jump .Lab_true_zero + r28 = ##0x7c000000 + } + + + + + + { + p0 = bitsclr(r1,r28) + if (p0.new) jump:nt .Lfma_ab_tiny + } + { + r28 = add(clb(r11:10),#-11) + } + { + r11:10 = asl(r11:10,r28) + } + { + r3:2 = insert(r11:10,#63,#0) + r1 -= asl(r28,#20) + } + jump fma + +.Lfma_ab_tiny: + r9:8 = combine(##0x00100000,#0) + { + r1:0 = insert(r9:8,#63,#0) + r3:2 = insert(r9:8,#63,#0) + } + jump fma + +.Lab_inf: + { + r3:2 = lsr(r3:2,#63) + p0 = dfclass(r5:4,#0x10) + } + { + r1:0 ^= asl(r3:2,#63) + if (p0) jump .Lnan + } + { + p1 = dfclass(r5:4,#0x08) + if (p1.new) jump:nt .Lfma_inf_plus_inf + } + + { + jumpr r31 + } + .falign +.Lfma_inf_plus_inf: + { + p0 = dfcmp.eq(r1:0,r5:4) + if (!p0.new) jump:nt .Linvalid + } + { + jumpr r31 + } + +.Lnan: + { + p0 = dfclass(r3:2,#0x10) + p1 = dfclass(r5:4,#0x10) + if (!p0.new) r3:2 = r1:0 + if (!p1.new) r5:4 = r1:0 + } + { + r3 = convert_df2sf(r3:2) + r2 = convert_df2sf(r5:4) + } + { + r3 = convert_df2sf(r1:0) + r1:0 = #-1 + jumpr r31 + } + +.Linvalid: + { + r28 = ##0x7f800001 + } + { + r1:0 = convert_sf2df(r28) + jumpr r31 + } + +.Lab_true_zero: + + { + p0 = dfclass(r5:4,#0x10) + if (p0.new) jump:nt .Lnan + if (p0.new) r1:0 = r5:4 + } + { + p0 = dfcmp.eq(r3:2,r5:4) + r1 = lsr(r1,#31) + } + { + r3 ^= asl(r1,#31) + if (!p0) r1:0 = r5:4 + if (!p0) jumpr r31 + } + + { + p0 = cmp.eq(r3:2,r5:4) + if (p0.new) jumpr:t r31 + r1:0 = r3:2 + } + { + r28 = USR + } + { + r28 = extractu(r28,#2,#22) + r1:0 = #0 + } + { + p0 = cmp.eq(r28,#2) + if (p0.new) r1 = ##0x80000000 + jumpr r31 + } + + + + + .falign +.Lfma_abnormal_c: + + + { + p0 = dfclass(r5:4,#0x10) + if (p0.new) jump:nt .Lnan + if (p0.new) r1:0 = r5:4 + deallocframe + } + { + p0 = dfclass(r5:4,#0x08) + if (p0.new) r1:0 = r5:4 + if (p0.new) jumpr:nt r31 + } + + + { + p0 = dfclass(r5:4,#0x01) + if (p0.new) jump:nt __hexagon_muldf3 + r28 = #1 + } + + + { + allocframe(#32) + r11:10 = #0 + r5 = insert(r28,#11,#20) + jump .Lfma_abnormal_c_restart + } +.size fma,.-fma diff --git a/src/hexagon/dfminmax.s b/src/hexagon/dfminmax.s new file mode 100644 index 00000000..3337a322 --- /dev/null +++ b/src/hexagon/dfminmax.s @@ -0,0 +1,51 @@ + .text + .global __hexagon_mindf3 + .global __hexagon_maxdf3 + .global fmin + .type fmin,@function + .global fmax + .type fmax,@function + .type __hexagon_mindf3,@function + .type __hexagon_maxdf3,@function + .global __qdsp_mindf3 ; .set __qdsp_mindf3, __hexagon_mindf3 + .global __qdsp_maxdf3 ; .set __qdsp_maxdf3, __hexagon_maxdf3 + .p2align 5 +__hexagon_mindf3: +fmin: + { + p0 = dfclass(r1:0,#0x10) + p1 = dfcmp.gt(r1:0,r3:2) + r5:4 = r1:0 + } + { + if (p0) r1:0 = r3:2 + if (p1) r1:0 = r3:2 + p2 = 
dfcmp.eq(r1:0,r3:2) + if (!p2.new) jumpr:t r31 + } + + { + r1:0 = or(r5:4,r3:2) + jumpr r31 + } +.size __hexagon_mindf3,.-__hexagon_mindf3 + .falign +__hexagon_maxdf3: +fmax: + { + p0 = dfclass(r1:0,#0x10) + p1 = dfcmp.gt(r3:2,r1:0) + r5:4 = r1:0 + } + { + if (p0) r1:0 = r3:2 + if (p1) r1:0 = r3:2 + p2 = dfcmp.eq(r1:0,r3:2) + if (!p2.new) jumpr:t r31 + } + + { + r1:0 = and(r5:4,r3:2) + jumpr r31 + } +.size __hexagon_maxdf3,.-__hexagon_maxdf3 diff --git a/src/hexagon/dfmul.s b/src/hexagon/dfmul.s new file mode 100644 index 00000000..32fc674f --- /dev/null +++ b/src/hexagon/dfmul.s @@ -0,0 +1,309 @@ + .text + .global __hexagon_muldf3 + .type __hexagon_muldf3,@function + .global __qdsp_muldf3 ; .set __qdsp_muldf3, __hexagon_muldf3 + .global __hexagon_fast_muldf3 ; .set __hexagon_fast_muldf3, __hexagon_muldf3 + .global __hexagon_fast2_muldf3 ; .set __hexagon_fast2_muldf3, __hexagon_muldf3 + .p2align 5 +__hexagon_muldf3: + { + p0 = dfclass(r1:0,#2) + p0 = dfclass(r3:2,#2) + r13:12 = combine(##0x40000000,#0) + } + { + r13:12 = insert(r1:0,#52,#11 -1) + r5:4 = asl(r3:2,#11 -1) + r28 = #-1024 + r9:8 = #1 + } + { + r7:6 = mpyu(r4,r13) + r5:4 = insert(r9:8,#2,#62) + } + + + + + { + r15:14 = mpyu(r12,r4) + r7:6 += mpyu(r12,r5) + } + { + r7:6 += lsr(r15:14,#32) + r11:10 = mpyu(r13,r5) + r5:4 = combine(##1024 +1024 -4,#0) + } + { + r11:10 += lsr(r7:6,#32) + if (!p0) jump .Lmul_abnormal + p1 = cmp.eq(r14,#0) + p1 = cmp.eq(r6,#0) + } + { + if (!p1) r10 = or(r10,r8) + r6 = extractu(r1,#11,#20) + r7 = extractu(r3,#11,#20) + } + { + r15:14 = neg(r11:10) + r6 += add(r28,r7) + r28 = xor(r1,r3) + } + { + if (!p2.new) r11:10 = r15:14 + p2 = cmp.gt(r28,#-1) + p0 = !cmp.gt(r6,r5) + p0 = cmp.gt(r6,r4) + if (!p0.new) jump:nt .Lmul_ovf_unf + } + { + r1:0 = convert_d2df(r11:10) + r6 = add(r6,#-1024 -58) + } + { + r1 += asl(r6,#20) + jumpr r31 + } + + .falign +.Lpossible_unf1: + { + p0 = cmp.eq(r0,#0) + p0 = bitsclr(r1,r4) + if (!p0.new) jumpr:t r31 + r5 = #0x7fff + } + { + p0 = bitsset(r13,r5) + r4 = USR + r5 = #0x030 + } + { + if (p0) r4 = or(r4,r5) + } + { + USR = r4 + } + { + p0 = dfcmp.eq(r1:0,r1:0) + jumpr r31 + } + .falign +.Lmul_ovf_unf: + { + r1:0 = convert_d2df(r11:10) + r13:12 = abs(r11:10) + r7 = add(r6,#-1024 -58) + } + { + r1 += asl(r7,#20) + r7 = extractu(r1,#11,#20) + r4 = ##0x7FEFFFFF + } + { + r7 += add(r6,##-1024 -58) + + r5 = #0 + } + { + p0 = cmp.gt(r7,##1024 +1024 -2) + if (p0.new) jump:nt .Lmul_ovf + } + { + p0 = cmp.gt(r7,#0) + if (p0.new) jump:nt .Lpossible_unf1 + r5 = sub(r6,r5) + r28 = #63 + } + { + r4 = #0 + r5 = sub(#5,r5) + } + { + p3 = cmp.gt(r11,#-1) + r5 = min(r5,r28) + r11:10 = r13:12 + } + { + r28 = USR + r15:14 = extractu(r11:10,r5:4) + } + { + r11:10 = asr(r11:10,r5) + r4 = #0x0030 + r1 = insert(r9,#11,#20) + } + { + p0 = cmp.gtu(r9:8,r15:14) + if (!p0.new) r10 = or(r10,r8) + r11 = setbit(r11,#20 +3) + } + { + r15:14 = neg(r11:10) + p1 = bitsclr(r10,#0x7) + if (!p1.new) r28 = or(r4,r28) + } + { + if (!p3) r11:10 = r15:14 + USR = r28 + } + { + r1:0 = convert_d2df(r11:10) + p0 = dfcmp.eq(r1:0,r1:0) + } + { + r1 = insert(r9,#11 -1,#20 +1) + jumpr r31 + } + .falign +.Lmul_ovf: + + { + r28 = USR + r13:12 = combine(##0x7fefffff,#-1) + r1:0 = r11:10 + } + { + r14 = extractu(r28,#2,#22) + r28 = or(r28,#0x28) + r5:4 = combine(##0x7ff00000,#0) + } + { + USR = r28 + r14 ^= lsr(r1,#31) + r28 = r14 + } + { + p0 = !cmp.eq(r28,#1) + p0 = !cmp.eq(r14,#2) + if (p0.new) r13:12 = r5:4 + p0 = dfcmp.eq(r1:0,r1:0) + } + { + r1:0 = insert(r13:12,#63,#0) + jumpr r31 + } + +.Lmul_abnormal: + { + r13:12 = 
extractu(r1:0,#63,#0) + r5:4 = extractu(r3:2,#63,#0) + } + { + p3 = cmp.gtu(r13:12,r5:4) + if (!p3.new) r1:0 = r3:2 + if (!p3.new) r3:2 = r1:0 + } + { + + p0 = dfclass(r1:0,#0x0f) + if (!p0.new) jump:nt .Linvalid_nan + if (!p3) r13:12 = r5:4 + if (!p3) r5:4 = r13:12 + } + { + + p1 = dfclass(r1:0,#0x08) + p1 = dfclass(r3:2,#0x0e) + } + { + + + p0 = dfclass(r1:0,#0x08) + p0 = dfclass(r3:2,#0x01) + } + { + if (p1) jump .Ltrue_inf + p2 = dfclass(r3:2,#0x01) + } + { + if (p0) jump .Linvalid_zeroinf + if (p2) jump .Ltrue_zero + r28 = ##0x7c000000 + } + + + + + + { + p0 = bitsclr(r1,r28) + if (p0.new) jump:nt .Lmul_tiny + } + { + r28 = cl0(r5:4) + } + { + r28 = add(r28,#-11) + } + { + r5:4 = asl(r5:4,r28) + } + { + r3:2 = insert(r5:4,#63,#0) + r1 -= asl(r28,#20) + } + jump __hexagon_muldf3 +.Lmul_tiny: + { + r28 = USR + r1:0 = xor(r1:0,r3:2) + } + { + r28 = or(r28,#0x30) + r1:0 = insert(r9:8,#63,#0) + r5 = extractu(r28,#2,#22) + } + { + USR = r28 + p0 = cmp.gt(r5,#1) + if (!p0.new) r0 = #0 + r5 ^= lsr(r1,#31) + } + { + p0 = cmp.eq(r5,#3) + if (!p0.new) r0 = #0 + jumpr r31 + } +.Linvalid_zeroinf: + { + r28 = USR + } + { + r1:0 = #-1 + r28 = or(r28,#2) + } + { + USR = r28 + } + { + p0 = dfcmp.uo(r1:0,r1:0) + jumpr r31 + } +.Linvalid_nan: + { + p0 = dfclass(r3:2,#0x0f) + r28 = convert_df2sf(r1:0) + if (p0.new) r3:2 = r1:0 + } + { + r2 = convert_df2sf(r3:2) + r1:0 = #-1 + jumpr r31 + } + .falign +.Ltrue_zero: + { + r1:0 = r3:2 + r3:2 = r1:0 + } +.Ltrue_inf: + { + r3 = extract(r3,#1,#31) + } + { + r1 ^= asl(r3,#31) + jumpr r31 + } +.size __hexagon_muldf3,.-__hexagon_muldf3 diff --git a/src/hexagon/dfsqrt.s b/src/hexagon/dfsqrt.s new file mode 100644 index 00000000..14f584a1 --- /dev/null +++ b/src/hexagon/dfsqrt.s @@ -0,0 +1,277 @@ + .text + .global __hexagon_sqrtdf2 + .type __hexagon_sqrtdf2,@function + .global __hexagon_sqrt + .type __hexagon_sqrt,@function + .global __qdsp_sqrtdf2 ; .set __qdsp_sqrtdf2, __hexagon_sqrtdf2; .type __qdsp_sqrtdf2,@function + .global __qdsp_sqrt ; .set __qdsp_sqrt, __hexagon_sqrt; .type __qdsp_sqrt,@function + .global __hexagon_fast_sqrtdf2 ; .set __hexagon_fast_sqrtdf2, __hexagon_sqrtdf2; .type __hexagon_fast_sqrtdf2,@function + .global __hexagon_fast_sqrt ; .set __hexagon_fast_sqrt, __hexagon_sqrt; .type __hexagon_fast_sqrt,@function + .global __hexagon_fast2_sqrtdf2 ; .set __hexagon_fast2_sqrtdf2, __hexagon_sqrtdf2; .type __hexagon_fast2_sqrtdf2,@function + .global __hexagon_fast2_sqrt ; .set __hexagon_fast2_sqrt, __hexagon_sqrt; .type __hexagon_fast2_sqrt,@function + .type sqrt,@function + .p2align 5 +__hexagon_sqrtdf2: +__hexagon_sqrt: + { + r15:14 = extractu(r1:0,#23 +1,#52 -23) + r28 = extractu(r1,#11,#52 -32) + r5:4 = combine(##0x3f000004,#1) + } + { + p2 = dfclass(r1:0,#0x02) + p2 = cmp.gt(r1,#-1) + if (!p2.new) jump:nt .Lsqrt_abnormal + r9 = or(r5,r14) + } + +.Ldenormal_restart: + { + r11:10 = r1:0 + r7,p0 = sfinvsqrta(r9) + r5 = and(r5,#-16) + r3:2 = #0 + } + { + r3 += sfmpy(r7,r9):lib + r2 += sfmpy(r7,r5):lib + r6 = r5 + + + r9 = and(r28,#1) + } + { + r6 -= sfmpy(r3,r2):lib + r11 = insert(r4,#11 +1,#52 -32) + p1 = cmp.gtu(r9,#0) + } + { + r3 += sfmpy(r3,r6):lib + r2 += sfmpy(r2,r6):lib + r6 = r5 + r9 = mux(p1,#8,#9) + } + { + r6 -= sfmpy(r3,r2):lib + r11:10 = asl(r11:10,r9) + r9 = mux(p1,#3,#2) + } + { + r2 += sfmpy(r2,r6):lib + + r15:14 = asl(r11:10,r9) + } + { + r2 = and(r2,##0x007fffff) + } + { + r2 = add(r2,##0x00800000 - 3) + r9 = mux(p1,#7,#8) + } + { + r8 = asl(r2,r9) + r9 = mux(p1,#15-(1+1),#15-(1+0)) + } + { + r13:12 = mpyu(r8,r15) + } + { + r1:0 = 
asl(r11:10,#15) + r15:14 = mpyu(r13,r13) + p1 = cmp.eq(r0,r0) + } + { + r1:0 -= asl(r15:14,#15) + r15:14 = mpyu(r13,r12) + p2 = cmp.eq(r0,r0) + } + { + r1:0 -= lsr(r15:14,#16) + p3 = cmp.eq(r0,r0) + } + { + r1:0 = mpyu(r1,r8) + } + { + r13:12 += lsr(r1:0,r9) + r9 = add(r9,#16) + r1:0 = asl(r11:10,#31) + } + + { + r15:14 = mpyu(r13,r13) + r1:0 -= mpyu(r13,r12) + } + { + r1:0 -= asl(r15:14,#31) + r15:14 = mpyu(r12,r12) + } + { + r1:0 -= lsr(r15:14,#33) + } + { + r1:0 = mpyu(r1,r8) + } + { + r13:12 += lsr(r1:0,r9) + r9 = add(r9,#16) + r1:0 = asl(r11:10,#47) + } + + { + r15:14 = mpyu(r13,r13) + } + { + r1:0 -= asl(r15:14,#47) + r15:14 = mpyu(r13,r12) + } + { + r1:0 -= asl(r15:14,#16) + r15:14 = mpyu(r12,r12) + } + { + r1:0 -= lsr(r15:14,#17) + } + { + r1:0 = mpyu(r1,r8) + } + { + r13:12 += lsr(r1:0,r9) + } + { + r3:2 = mpyu(r13,r12) + r5:4 = mpyu(r12,r12) + r15:14 = #0 + r1:0 = #0 + } + { + r3:2 += lsr(r5:4,#33) + r5:4 += asl(r3:2,#33) + p1 = cmp.eq(r0,r0) + } + { + r7:6 = mpyu(r13,r13) + r1:0 = sub(r1:0,r5:4,p1):carry + r9:8 = #1 + } + { + r7:6 += lsr(r3:2,#31) + r9:8 += asl(r13:12,#1) + } + + + + + + { + r15:14 = sub(r11:10,r7:6,p1):carry + r5:4 = sub(r1:0,r9:8,p2):carry + + + + + r7:6 = #1 + r11:10 = #0 + } + { + r3:2 = sub(r15:14,r11:10,p2):carry + r7:6 = add(r13:12,r7:6) + r28 = add(r28,#-0x3ff) + } + { + + if (p2) r13:12 = r7:6 + if (p2) r1:0 = r5:4 + if (p2) r15:14 = r3:2 + } + { + r5:4 = sub(r1:0,r9:8,p3):carry + r7:6 = #1 + r28 = asr(r28,#1) + } + { + r3:2 = sub(r15:14,r11:10,p3):carry + r7:6 = add(r13:12,r7:6) + } + { + if (p3) r13:12 = r7:6 + if (p3) r1:0 = r5:4 + + + + + + r2 = #1 + } + { + p0 = cmp.eq(r1:0,r11:10) + if (!p0.new) r12 = or(r12,r2) + r3 = cl0(r13:12) + r28 = add(r28,#-63) + } + + + + { + r1:0 = convert_ud2df(r13:12) + r28 = add(r28,r3) + } + { + r1 += asl(r28,#52 -32) + jumpr r31 + } +.Lsqrt_abnormal: + { + p0 = dfclass(r1:0,#0x01) + if (p0.new) jumpr:t r31 + } + { + p0 = dfclass(r1:0,#0x10) + if (p0.new) jump:nt .Lsqrt_nan + } + { + p0 = cmp.gt(r1,#-1) + if (!p0.new) jump:nt .Lsqrt_invalid_neg + if (!p0.new) r28 = ##0x7F800001 + } + { + p0 = dfclass(r1:0,#0x08) + if (p0.new) jumpr:nt r31 + } + + + { + r1:0 = extractu(r1:0,#52,#0) + } + { + r28 = add(clb(r1:0),#-11) + } + { + r1:0 = asl(r1:0,r28) + r28 = sub(#1,r28) + } + { + r1 = insert(r28,#1,#52 -32) + } + { + r3:2 = extractu(r1:0,#23 +1,#52 -23) + r5 = ##0x3f000004 + } + { + r9 = or(r5,r2) + r5 = and(r5,#-16) + jump .Ldenormal_restart + } +.Lsqrt_nan: + { + r28 = convert_df2sf(r1:0) + r1:0 = #-1 + jumpr r31 + } +.Lsqrt_invalid_neg: + { + r1:0 = convert_sf2df(r28) + jumpr r31 + } +.size __hexagon_sqrt,.-__hexagon_sqrt +.size __hexagon_sqrtdf2,.-__hexagon_sqrtdf2 diff --git a/src/hexagon/divdi3.s b/src/hexagon/divdi3.s new file mode 100644 index 00000000..0fee6e70 --- /dev/null +++ b/src/hexagon/divdi3.s @@ -0,0 +1,64 @@ + +FUNCTION_BEGIN __hexagon_divdi3 + { + p2 = tstbit(r1,#31) + p3 = tstbit(r3,#31) + } + { + r1:0 = abs(r1:0) + r3:2 = abs(r3:2) + } + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + p3 = xor(p2,p3) + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jump .hexagon_divdi3_return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + 
+.hexagon_divdi3_return: + { + r3:2 = neg(r1:0) + } + { + r1:0 = vmux(p3,r3:2,r1:0) + jumpr r31 + } +FUNCTION_END __hexagon_divdi3 + + .globl __qdsp_divdi3 + .set __qdsp_divdi3, __hexagon_divdi3 diff --git a/src/hexagon/divsi3.s b/src/hexagon/divsi3.s new file mode 100644 index 00000000..fc957a43 --- /dev/null +++ b/src/hexagon/divsi3.s @@ -0,0 +1,53 @@ + +FUNCTION_BEGIN __hexagon_divsi3 + { + p0 = cmp.ge(r0,#0) + p1 = cmp.ge(r1,#0) + r1 = abs(r0) + r2 = abs(r1) + } + { + r3 = cl0(r1) + r4 = cl0(r2) + r5 = sub(r1,r2) + p2 = cmp.gtu(r2,r1) + } + { + r0 = #0 + p1 = xor(p0,p1) + p0 = cmp.gtu(r2,r5) + if (p2) jumpr r31 + } + + { + r0 = mux(p1,#-1,#1) + if (p0) jumpr r31 + r4 = sub(r4,r3) + r3 = #1 + } + { + r0 = #0 + r3:2 = vlslw(r3:2,r4) + loop0(1f,r4) + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r0 = add(r0,r3) + if (!p1) jumpr r31 + } + { + r0 = neg(r0) + jumpr r31 + } +FUNCTION_END __hexagon_divsi3 + + .globl __qdsp_divsi3 + .set __qdsp_divsi3, __hexagon_divsi3 diff --git a/src/hexagon/fastmath2_dlib_asm.s b/src/hexagon/fastmath2_dlib_asm.s new file mode 100644 index 00000000..15c38784 --- /dev/null +++ b/src/hexagon/fastmath2_dlib_asm.s @@ -0,0 +1,266 @@ + .text + .global fast2_dadd_asm + .type fast2_dadd_asm, @function +fast2_dadd_asm: + .falign + { + R7:6 = VABSDIFFH(R1:0, R3:2) + R9 = #62 + R4 = SXTH(R0) + R5 = SXTH(R2) + } { + R6 = SXTH(R6) + P0 = CMP.GT(R4, R5); + if ( P0.new) R8 = add(R4, #1) + if (!P0.new) R8 = add(R5, #1) + } { + if ( P0) R4 = #1 + if (!P0) R5 = #1 + R0.L = #0 + R6 = MIN(R6, R9) + } { + if (!P0) R4 = add(R6, #1) + if ( P0) R5 = add(R6, #1) + R2.L = #0 + R11:10 = #0 + } { + R1:0 = ASR(R1:0, R4) + R3:2 = ASR(R3:2, R5) + } { + R1:0 = add(R1:0, R3:2) + R10.L = #0x8001 + } { + R4 = clb(R1:0) + R9 = #58 + } { + R4 = add(R4, #-1) + p0 = cmp.gt(R4, R9) + } { + R1:0 = ASL(R1:0, R4) + R8 = SUB(R8, R4) + if(p0) jump .Ldenorma + } { + R0 = insert(R8, #16, #0) + jumpr r31 + } +.Ldenorma: + { + R1:0 = R11:10 + jumpr r31 + } + .text + .global fast2_dsub_asm + .type fast2_dsub_asm, @function +fast2_dsub_asm: + .falign + { + R7:6 = VABSDIFFH(R1:0, R3:2) + R9 = #62 + R4 = SXTH(R0) + R5 = SXTH(R2) + } { + R6 = SXTH(R6) + P0 = CMP.GT(R4, R5); + if ( P0.new) R8 = add(R4, #1) + if (!P0.new) R8 = add(R5, #1) + } { + if ( P0) R4 = #1 + if (!P0) R5 = #1 + R0.L = #0 + R6 = MIN(R6, R9) + } { + if (!P0) R4 = add(R6, #1) + if ( P0) R5 = add(R6, #1) + R2.L = #0 + R11:10 = #0 + } { + R1:0 = ASR(R1:0, R4) + R3:2 = ASR(R3:2, R5) + } { + R1:0 = sub(R1:0, R3:2) + R10.L = #0x8001 + } { + R4 = clb(R1:0) + R9 = #58 + } { + R4 = add(R4, #-1) + p0 = cmp.gt(R4, R9) + } { + R1:0 = ASL(R1:0, R4) + R8 = SUB(R8, R4) + if(p0) jump .Ldenorm + } { + R0 = insert(R8, #16, #0) + jumpr r31 + } +.Ldenorm: + { + R1:0 = R11:10 + jumpr r31 + } + .text + .global fast2_dmpy_asm + .type fast2_dmpy_asm, @function +fast2_dmpy_asm: + .falign + { + R13= lsr(R2, #16) + R5 = sxth(R2) + R4 = sxth(R0) + R12= lsr(R0, #16) + } + { + R11:10 = mpy(R1, R3) + R7:6 = mpy(R1, R13) + R0.L = #0x0 + R15:14 = #0 + } + { + R11:10 = add(R11:10, R11:10) + R7:6 += mpy(R3, R12) + R2.L = #0x0 + R15.H = #0x8000 + } + { + R7:6 = asr(R7:6, #15) + R12.L = #0x8001 + p1 = cmp.eq(R1:0, R3:2) + } + { + R7:6 = add(R7:6, R11:10) + R8 = add(R4, R5) + p2 = cmp.eq(R1:0, R15:14) + } + { + R9 = clb(R7:6) + R3:2 = abs(R7:6) + R11 = #58 + } + { + p1 = and(p1, p2) + R8 = sub(R8, R9) + R9 = add(R9, #-1) + p0 = cmp.gt(R9, R11) + } 
+ { + R8 = add(R8, #1) + R1:0 = asl(R7:6, R9) + if(p1) jump .Lsat + } + { + R0 = insert(R8,#16, #0) + if(!p0) jumpr r31 + } + { + R0 = insert(R12,#16, #0) + jumpr r31 + } +.Lsat: + { + R1:0 = #-1 + } + { + R1:0 = lsr(R1:0, #1) + } + { + R0 = insert(R8,#16, #0) + jumpr r31 + } + .text + .global fast2_qd2f_asm + .type fast2_qd2f_asm, @function +fast2_qd2f_asm: + .falign + { + R3 = abs(R1):sat + R4 = sxth(R0) + R5 = #0x40 + R6.L = #0xffc0 + } + { + R0 = extractu(R3, #8, #0) + p2 = cmp.gt(R4, #126) + p3 = cmp.ge(R4, #-126) + R6.H = #0x7fff + } + { + p1 = cmp.eq(R0,#0x40) + if(p1.new) R5 = #0 + R4 = add(R4, #126) + if(!p3) jump .Lmin + } + { + p0 = bitsset(R3, R6) + R0.L = #0x0000 + R2 = add(R3, R5) + R7 = lsr(R6, #8) + } + { + if(p0) R4 = add(R4, #1) + if(p0) R3 = #0 + R2 = lsr(R2, #7) + R0.H = #0x8000 + } + { + R0 = and(R0, R1) + R6 &= asl(R4, #23) + if(!p0) R3 = and(R2, R7) + if(p2) jump .Lmax + } + { + R0 += add(R6, R3) + jumpr r31 + } +.Lmax: + { + R0.L = #0xffff; + } + { + R0.H = #0x7f7f; + jumpr r31 + } +.Lmin: + { + R0 = #0x0 + jumpr r31 + } + .text + .global fast2_f2qd_asm + .type fast2_f2qd_asm, @function +fast2_f2qd_asm: + + + + + + + + .falign + { + R1 = asl(R0, #7) + p0 = tstbit(R0, #31) + R5:4 = #0 + R3 = add(R0,R0) + } + { + R1 = setbit(R1, #30) + R0= extractu(R0,#8,#23) + R4.L = #0x8001 + p1 = cmp.eq(R3, #0) + } + { + R1= extractu(R1, #31, #0) + R0= add(R0, #-126) + R2 = #0 + if(p1) jump .Lminqd + } + { + R0 = zxth(R0) + if(p0) R1= sub(R2, R1) + jumpr r31 + } +.Lminqd: + { + R1:0 = R5:4 + jumpr r31 + } diff --git a/src/hexagon/fastmath2_ldlib_asm.s b/src/hexagon/fastmath2_ldlib_asm.s new file mode 100644 index 00000000..b72b7550 --- /dev/null +++ b/src/hexagon/fastmath2_ldlib_asm.s @@ -0,0 +1,187 @@ + .text + .global fast2_ldadd_asm + .type fast2_ldadd_asm, @function +fast2_ldadd_asm: + .falign + { + R4 = memw(r29+#8) + R5 = memw(r29+#24) + r7 = r0 + } + { + R6 = sub(R4, R5):sat + P0 = CMP.GT(R4, R5); + if ( P0.new) R8 = add(R4, #1) + if (!P0.new) R8 = add(R5, #1) + } { + R6 = abs(R6):sat + if ( P0) R4 = #1 + if (!P0) R5 = #1 + R9 = #62 + } { + R6 = MIN(R6, R9) + R1:0 = memd(r29+#0) + R3:2 = memd(r29+#16) + } { + if (!P0) R4 = add(R6, #1) + if ( P0) R5 = add(R6, #1) + } { + R1:0 = ASR(R1:0, R4) + R3:2 = ASR(R3:2, R5) + } { + R1:0 = add(R1:0, R3:2) + R3:2 = #0 + } { + R4 = clb(R1:0) + R9.L =#0x0001 + } { + R8 -= add(R4, #-1) + R4 = add(R4, #-1) + p0 = cmp.gt(R4, #58) + R9.H =#0x8000 + } { + if(!p0)memw(r7+#8) = R8 + R1:0 = ASL(R1:0, R4) + if(p0) jump .Ldenorma1 + } { + memd(r7+#0) = R1:0 + jumpr r31 + } +.Ldenorma1: + memd(r7+#0) = R3:2 + { + memw(r7+#8) = R9 + jumpr r31 + } + .text + .global fast2_ldsub_asm + .type fast2_ldsub_asm, @function +fast2_ldsub_asm: + .falign + { + R4 = memw(r29+#8) + R5 = memw(r29+#24) + r7 = r0 + } + { + R6 = sub(R4, R5):sat + P0 = CMP.GT(R4, R5); + if ( P0.new) R8 = add(R4, #1) + if (!P0.new) R8 = add(R5, #1) + } { + R6 = abs(R6):sat + if ( P0) R4 = #1 + if (!P0) R5 = #1 + R9 = #62 + } { + R6 = min(R6, R9) + R1:0 = memd(r29+#0) + R3:2 = memd(r29+#16) + } { + if (!P0) R4 = add(R6, #1) + if ( P0) R5 = add(R6, #1) + } { + R1:0 = ASR(R1:0, R4) + R3:2 = ASR(R3:2, R5) + } { + R1:0 = sub(R1:0, R3:2) + R3:2 = #0 + } { + R4 = clb(R1:0) + R9.L =#0x0001 + } { + R8 -= add(R4, #-1) + R4 = add(R4, #-1) + p0 = cmp.gt(R4, #58) + R9.H =#0x8000 + } { + if(!p0)memw(r7+#8) = R8 + R1:0 = asl(R1:0, R4) + if(p0) jump .Ldenorma_s + } { + memd(r7+#0) = R1:0 + jumpr r31 + } +.Ldenorma_s: + memd(r7+#0) = R3:2 + { + memw(r7+#8) = R9 + jumpr r31 + } + .text + .global 
fast2_ldmpy_asm + .type fast2_ldmpy_asm, @function +fast2_ldmpy_asm: + .falign + { + R15:14 = memd(r29+#0) + R3:2 = memd(r29+#16) + R13:12 = #0 + } + { + R8= extractu(R2, #31, #1) + R9= extractu(R14, #31, #1) + R13.H = #0x8000 + } + { + R11:10 = mpy(R15, R3) + R7:6 = mpy(R15, R8) + R4 = memw(r29+#8) + R5 = memw(r29+#24) + } + { + R11:10 = add(R11:10, R11:10) + R7:6 += mpy(R3, R9) + } + { + R7:6 = asr(R7:6, #30) + R8.L = #0x0001 + p1 = cmp.eq(R15:14, R3:2) + } + { + R7:6 = add(R7:6, R11:10) + R4= add(R4, R5) + p2 = cmp.eq(R3:2, R13:12) + } + { + R9 = clb(R7:6) + R8.H = #0x8000 + p1 = and(p1, p2) + } + { + R4-= add(R9, #-1) + R9 = add(R9, #-1) + if(p1) jump .Lsat1 + } + { + R7:6 = asl(R7:6, R9) + memw(R0+#8) = R4 + p0 = cmp.gt(R9, #58) + if(p0.new) jump:NT .Ldenorm1 + } + { + memd(R0+#0) = R7:6 + jumpr r31 + } +.Lsat1: + { + R13:12 = #0 + R4+= add(R9, #1) + } + { + R13.H = #0x4000 + memw(R0+#8) = R4 + } + { + memd(R0+#0) = R13:12 + jumpr r31 + } +.Ldenorm1: + { + memw(R0+#8) = R8 + R15:14 = #0 + } + { + memd(R0+#0) = R15:14 + jumpr r31 + } diff --git a/src/hexagon/func_macro.s b/src/hexagon/func_macro.s new file mode 100644 index 00000000..9a1e11ae --- /dev/null +++ b/src/hexagon/func_macro.s @@ -0,0 +1,12 @@ + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + diff --git a/src/hexagon/memcpy_forward_vp4cp4n2.s b/src/hexagon/memcpy_forward_vp4cp4n2.s new file mode 100644 index 00000000..89f69010 --- /dev/null +++ b/src/hexagon/memcpy_forward_vp4cp4n2.s @@ -0,0 +1,91 @@ + .text + + + + + + + .globl hexagon_memcpy_forward_vp4cp4n2 + .balign 32 + .type hexagon_memcpy_forward_vp4cp4n2,@function +hexagon_memcpy_forward_vp4cp4n2: + + + + + { + r3 = sub(##4096, r1) + r5 = lsr(r2, #3) + } + { + + + r3 = extractu(r3, #10, #2) + r4 = extractu(r3, #7, #5) + } + { + r3 = minu(r2, r3) + r4 = minu(r5, r4) + } + { + r4 = or(r4, ##2105344) + p0 = cmp.eq(r3, #0) + if (p0.new) jump:nt .Lskipprolog + } + l2fetch(r1, r4) + { + loop0(.Lprolog, r3) + r2 = sub(r2, r3) + } + .falign +.Lprolog: + { + r4 = memw(r1++#4) + memw(r0++#4) = r4.new + } :endloop0 +.Lskipprolog: + { + + r3 = lsr(r2, #10) + if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain + } + { + loop1(.Lout, r3) + r2 = extractu(r2, #10, #0) + r3 = ##2105472 + } + + .falign +.Lout: + + l2fetch(r1, r3) + loop0(.Lpage, #512) + .falign +.Lpage: + r5:4 = memd(r1++#8) + { + memw(r0++#8) = r4 + memw(r0+#4) = r5 + } :endloop0:endloop1 +.Lskipmain: + { + r3 = ##2105344 + r4 = lsr(r2, #3) + p0 = cmp.eq(r2, #0) + if (p0.new) jumpr:nt r31 + } + { + r3 = or(r3, r4) + loop0(.Lepilog, r2) + } + l2fetch(r1, r3) + .falign +.Lepilog: + { + r4 = memw(r1++#4) + memw(r0++#4) = r4.new + } :endloop0 + + jumpr r31 + +.size hexagon_memcpy_forward_vp4cp4n2, . 
- hexagon_memcpy_forward_vp4cp4n2 diff --git a/src/hexagon/memcpy_likely_aligned.s b/src/hexagon/memcpy_likely_aligned.s new file mode 100644 index 00000000..7e9b62f6 --- /dev/null +++ b/src/hexagon/memcpy_likely_aligned.s @@ -0,0 +1,42 @@ + +FUNCTION_BEGIN __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes + { + p0 = bitsclr(r1,#7) + p0 = bitsclr(r0,#7) + if (p0.new) r5:4 = memd(r1) + r3 = #-3 + } + { + if (!p0) jump .Lmemcpy_call + if (p0) memd(r0++#8) = r5:4 + if (p0) r5:4 = memd(r1+#8) + r3 += lsr(r2,#3) + } + { + memd(r0++#8) = r5:4 + r5:4 = memd(r1+#16) + r1 = add(r1,#24) + loop0(1f,r3) + } + .falign +1: + { + memd(r0++#8) = r5:4 + r5:4 = memd(r1++#8) + }:endloop0 + { + memd(r0) = r5:4 + r0 -= add(r2,#-8) + jumpr r31 + } +FUNCTION_END __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes + +.Lmemcpy_call: + + jump memcpy@PLT + + + + + .globl __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes + .set __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes, __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes diff --git a/src/hexagon/moddi3.s b/src/hexagon/moddi3.s new file mode 100644 index 00000000..53ea6d52 --- /dev/null +++ b/src/hexagon/moddi3.s @@ -0,0 +1,63 @@ + + +FUNCTION_BEGIN __hexagon_moddi3 + { + p3 = tstbit(r1,#31) + } + { + r1:0 = abs(r1:0) + r3:2 = abs(r3:2) + } + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jump .hexagon_moddi3_return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + +.hexagon_moddi3_return: + { + r1:0 = neg(r3:2) + } + { + r1:0 = vmux(p3,r1:0,r3:2) + jumpr r31 + } +FUNCTION_END __hexagon_moddi3 + + .globl __qdsp_moddi3 + .set __qdsp_moddi3, __hexagon_moddi3 diff --git a/src/hexagon/modsi3.s b/src/hexagon/modsi3.s new file mode 100644 index 00000000..c4ae7e59 --- /dev/null +++ b/src/hexagon/modsi3.s @@ -0,0 +1,44 @@ + + +FUNCTION_BEGIN __hexagon_modsi3 + { + p2 = cmp.ge(r0,#0) + r2 = abs(r0) + r1 = abs(r1) + } + { + r3 = cl0(r2) + r4 = cl0(r1) + p0 = cmp.gtu(r1,r2) + } + { + r3 = sub(r4,r3) + if (p0) jumpr r31 + } + { + p1 = cmp.eq(r3,#0) + loop0(1f,r3) + r0 = r2 + r2 = lsl(r1,r3) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + if (p2) jumpr r31 + } + { + r0 = neg(r0) + jumpr r31 + } +FUNCTION_END __hexagon_modsi3 + + .globl __qdsp_modsi3 + .set __qdsp_modsi3, __hexagon_modsi3 diff --git a/src/hexagon/sfdiv_opt.s b/src/hexagon/sfdiv_opt.s new file mode 100644 index 00000000..26c91f15 --- /dev/null +++ b/src/hexagon/sfdiv_opt.s @@ -0,0 +1,42 @@ + +FUNCTION_BEGIN __hexagon_divsf3 + { + r2,p0 = sfrecipa(r0,r1) + r4 = sffixupd(r0,r1) + r3 = ##0x3f800000 + } + { + r5 = sffixupn(r0,r1) + r3 -= sfmpy(r4,r2):lib + r6 = ##0x80000000 + r7 = r3 + } + { + r2 += sfmpy(r3,r2):lib + r3 = r7 + r6 = r5 + r0 = and(r6,r5) + } + { + r3 -= sfmpy(r4,r2):lib + r0 += sfmpy(r5,r2):lib + } + { + r2 += sfmpy(r3,r2):lib + r6 -= sfmpy(r0,r4):lib + } + { + r0 += sfmpy(r6,r2):lib + } + { + r5 -= sfmpy(r0,r4):lib + } + { + r0 += sfmpy(r5,r2,p0):scale + jumpr r31 + } +FUNCTION_END __hexagon_divsf3 + +.global __qdsp_divsf3 ; .set 
__qdsp_divsf3, __hexagon_divsf3 +.global __hexagon_fast_divsf3 ; .set __hexagon_fast_divsf3, __hexagon_divsf3 +.global __hexagon_fast2_divsf3 ; .set __hexagon_fast2_divsf3, __hexagon_divsf3 diff --git a/src/hexagon/sfsqrt_opt.s b/src/hexagon/sfsqrt_opt.s new file mode 100644 index 00000000..c90af179 --- /dev/null +++ b/src/hexagon/sfsqrt_opt.s @@ -0,0 +1,49 @@ +FUNCTION_BEGIN __hexagon_sqrtf + { + r3,p0 = sfinvsqrta(r0) + r5 = sffixupr(r0) + r4 = ##0x3f000000 + r1:0 = combine(#0,#0) + } + { + r0 += sfmpy(r3,r5):lib + r1 += sfmpy(r3,r4):lib + r2 = r4 + r3 = r5 + } + { + r2 -= sfmpy(r0,r1):lib + p1 = sfclass(r5,#1) + + } + { + r0 += sfmpy(r0,r2):lib + r1 += sfmpy(r1,r2):lib + r2 = r4 + r3 = r5 + } + { + r2 -= sfmpy(r0,r1):lib + r3 -= sfmpy(r0,r0):lib + } + { + r0 += sfmpy(r1,r3):lib + r1 += sfmpy(r1,r2):lib + r2 = r4 + r3 = r5 + } + { + + r3 -= sfmpy(r0,r0):lib + if (p1) r0 = or(r0,r5) + } + { + r0 += sfmpy(r1,r3,p0):scale + jumpr r31 + } + +FUNCTION_END __hexagon_sqrtf + +.global __qdsp_sqrtf ; .set __qdsp_sqrtf, __hexagon_sqrtf +.global __hexagon_fast_sqrtf ; .set __hexagon_fast_sqrtf, __hexagon_sqrtf +.global __hexagon_fast2_sqrtf ; .set __hexagon_fast2_sqrtf, __hexagon_sqrtf diff --git a/src/hexagon/udivdi3.s b/src/hexagon/udivdi3.s new file mode 100644 index 00000000..f0fffc23 --- /dev/null +++ b/src/hexagon/udivdi3.s @@ -0,0 +1,50 @@ + + +FUNCTION_BEGIN __hexagon_udivdi3 + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jumpr r31 + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + { + jumpr r31 + } +FUNCTION_END __hexagon_udivdi3 + + .globl __qdsp_udivdi3 + .set __qdsp_udivdi3, __hexagon_udivdi3 diff --git a/src/hexagon/udivmoddi4.s b/src/hexagon/udivmoddi4.s new file mode 100644 index 00000000..cbfb3987 --- /dev/null +++ b/src/hexagon/udivmoddi4.s @@ -0,0 +1,50 @@ + + +FUNCTION_BEGIN __hexagon_udivmoddi4 + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jumpr r31 + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + { + jumpr r31 + } +FUNCTION_END __hexagon_udivmoddi4 + + .globl __qdsp_udivmoddi4 + .set __qdsp_udivmoddi4, __hexagon_udivmoddi4 diff --git a/src/hexagon/udivmodsi4.s b/src/hexagon/udivmodsi4.s new file mode 100644 index 00000000..83489c51 --- /dev/null +++ b/src/hexagon/udivmodsi4.s @@ -0,0 +1,39 @@ + + +FUNCTION_BEGIN __hexagon_udivmodsi4 + { + r2 = cl0(r0) + r3 = cl0(r1) + r5:4 = combine(#1,#0) + p0 = cmp.gtu(r1,r0) + } + { + r6 = sub(r3,r2) + r4 = r1 + r1:0 = combine(r0,r4) + if (p0) jumpr r31 + } + { + r3:2 = vlslw(r5:4,r6) + loop0(1f,r6) + p0 = cmp.eq(r6,#0) + if (p0.new) r4 = #0 + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = 
cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r4) + if (!p0.new) r0 = add(r0,r3) + jumpr r31 + } +FUNCTION_END __hexagon_udivmodsi4 + + .globl __qdsp_udivmodsi4 + .set __qdsp_udivmodsi4, __hexagon_udivmodsi4 diff --git a/src/hexagon/udivsi3.s b/src/hexagon/udivsi3.s new file mode 100644 index 00000000..e0b94aa9 --- /dev/null +++ b/src/hexagon/udivsi3.s @@ -0,0 +1,36 @@ + + +FUNCTION_BEGIN __hexagon_udivsi3 + { + r2 = cl0(r0) + r3 = cl0(r1) + r5:4 = combine(#1,#0) + p0 = cmp.gtu(r1,r0) + } + { + r6 = sub(r3,r2) + r4 = r1 + r1:0 = combine(r0,r4) + if (p0) jumpr r31 + } + { + r3:2 = vlslw(r5:4,r6) + loop0(1f,r6) + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r0 = add(r0,r3) + jumpr r31 + } +FUNCTION_END __hexagon_udivsi3 + + .globl __qdsp_udivsi3 + .set __qdsp_udivsi3, __hexagon_udivsi3 diff --git a/src/hexagon/umoddi3.s b/src/hexagon/umoddi3.s new file mode 100644 index 00000000..c76011c3 --- /dev/null +++ b/src/hexagon/umoddi3.s @@ -0,0 +1,53 @@ + + +FUNCTION_BEGIN __hexagon_umoddi3 + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jump .hexagon_umoddi3_return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + +.hexagon_umoddi3_return: + { + r1:0 = r3:2 + jumpr r31 + } +FUNCTION_END __hexagon_umoddi3 + + .globl __qdsp_umoddi3 + .set __qdsp_umoddi3, __hexagon_umoddi3 diff --git a/src/hexagon/umodsi3.s b/src/hexagon/umodsi3.s new file mode 100644 index 00000000..1b592a7c --- /dev/null +++ b/src/hexagon/umodsi3.s @@ -0,0 +1,34 @@ + + +FUNCTION_BEGIN __hexagon_umodsi3 + { + r2 = cl0(r0) + r3 = cl0(r1) + p0 = cmp.gtu(r1,r0) + } + { + r2 = sub(r3,r2) + if (p0) jumpr r31 + } + { + loop0(1f,r2) + p1 = cmp.eq(r2,#0) + r2 = lsl(r1,r2) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + jumpr r31 + } +FUNCTION_END __hexagon_umodsi3 + + .globl __qdsp_umodsi3 + .set __qdsp_umodsi3, __hexagon_umodsi3 diff --git a/src/lib.rs b/src/lib.rs index 3e549187..47aef540 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ #![cfg_attr(feature = "compiler-builtins", compiler_builtins)] #![cfg_attr(not(feature = "no-asm"), feature(asm))] #![feature(abi_unadjusted)] +#![feature(asm_experimental_arch)] #![cfg_attr(not(feature = "no-asm"), feature(global_asm))] #![feature(cfg_target_has_atomic)] #![feature(compiler_builtins)] @@ -70,6 +71,9 @@ pub mod aarch64_linux; ))] pub mod arm_linux; +#[cfg(target_arch = "hexagon")] +pub mod hexagon; + #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] pub mod riscv;
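
Note: the assembly above only defines C-ABI symbols; nothing in this patch calls them directly. The `global_asm!(include_str!(...), options(raw))` invocations in `src/hexagon.rs` exist solely to get these routines assembled into the object file, where they are intended to satisfy the compiler's float/integer libcalls rather than be called by hand. As a minimal sketch of what binding one of them from Rust could look like — only the symbol name `__hexagon_adddf3` is taken from `src/hexagon/dfaddsub.s` above; the signature is an assumption based on the conventional `__adddf3` soft-float ABI (two `f64` in, `f64` out) and is not stated anywhere in this diff:

    // Hypothetical caller-side binding; illustrative only.
    extern "C" {
        // Assumed signature: standard soft-float double addition.
        fn __hexagon_adddf3(a: f64, b: f64) -> f64;
    }

    fn add_f64(a: f64, b: f64) -> f64 {
        // SAFETY: the routine only reads its argument registers and follows
        // the standard Hexagon calling convention.
        unsafe { __hexagon_adddf3(a, b) }
    }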