diff --git a/sys/crypto/openssl/aarch64/armv8-mont.S b/sys/crypto/openssl/aarch64/armv8-mont.S
index 99f3a3d07ff94..4bdba95f80f8c 100644
--- a/sys/crypto/openssl/aarch64/armv8-mont.S
+++ b/sys/crypto/openssl/aarch64/armv8-mont.S
@@ -1,14 +1,32 @@
 /* Do not modify. This file is auto-generated from armv8-mont.pl. */
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+.hidden	OPENSSL_armv8_rsa_neonized
+#endif
 .text
 
 .globl	bn_mul_mont
 .type	bn_mul_mont,%function
 .align	5
 bn_mul_mont:
+.Lbn_mul_mont:
+	tst	x5,#3
+	b.ne	.Lmul_mont
+	cmp	x5,#32
+	b.le	.Lscalar_impl
+#ifndef __KERNEL__
+	adrp	x17,OPENSSL_armv8_rsa_neonized
+	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
+	cbnz	w17, bn_mul8x_mont_neon
+#endif
+
+.Lscalar_impl:
 	tst	x5,#7
 	b.eq	__bn_sqr8x_mont
 	tst	x5,#3
 	b.eq	__bn_mul4x_mont
+
 .Lmul_mont:
 	stp	x29,x30,[sp,#-64]!
 	add	x29,sp,#0
@@ -132,7 +150,7 @@ bn_mul_mont:
 	mul	x16,x14,x15		// np[j]*m1
 	adds	x12,x12,x6
 	umulh	x17,x14,x15
-	str	x12,[x22,#-16]		// tp[j-1]
+	stur	x12,[x22,#-16]		// tp[j-1]
 	cbnz	x21,.Linner
 
 .Linner_skip:
@@ -188,13 +206,13 @@ bn_mul_mont:
 	csel	x14,x23,x8,lo		// did it borrow?
 	ldr	x23,[x22],#8
 	ldr	x8,[x0],#8
-	str	xzr,[x22,#-16]		// wipe tp
-	str	x14,[x0,#-16]
+	stur	xzr,[x22,#-16]		// wipe tp
+	stur	x14,[x0,#-16]
 	cbnz	x5,.Lcond_copy
 
 	csel	x14,x23,x8,lo
-	str	xzr,[x22,#-8]		// wipe tp
-	str	x14,[x0,#-8]
+	stur	xzr,[x22,#-8]		// wipe tp
+	stur	x14,[x0,#-8]
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
@@ -204,6 +222,704 @@ bn_mul_mont:
 	ldr	x29,[sp],#64
 	ret
.size	bn_mul_mont,.-bn_mul_mont
+.type	bn_mul8x_mont_neon,%function
+.align	5
+bn_mul8x_mont_neon:
+	stp	x29,x30,[sp,#-80]!
+	mov	x16,sp
+	stp	d8,d9,[sp,#16]
+	stp	d10,d11,[sp,#32]
+	stp	d12,d13,[sp,#48]
+	stp	d14,d15,[sp,#64]
+	lsl	x5,x5,#1
+	eor	v14.16b,v14.16b,v14.16b
+
+.align	4
+.LNEON_8n:
+	eor	v6.16b,v6.16b,v6.16b
+	sub	x7,sp,#128
+	eor	v7.16b,v7.16b,v7.16b
+	sub	x7,x7,x5,lsl#4
+	eor	v8.16b,v8.16b,v8.16b
+	and	x7,x7,#-64
+	eor	v9.16b,v9.16b,v9.16b
+	mov	sp,x7			// alloca
+	eor	v10.16b,v10.16b,v10.16b
+	add	x7,x7,#256
+	eor	v11.16b,v11.16b,v11.16b
+	sub	x8,x5,#8
+	eor	v12.16b,v12.16b,v12.16b
+	eor	v13.16b,v13.16b,v13.16b
+
+.LNEON_8n_init:
+	st1	{v6.2d,v7.2d},[x7],#32
+	subs	x8,x8,#8
+	st1	{v8.2d,v9.2d},[x7],#32
+	st1	{v10.2d,v11.2d},[x7],#32
+	st1	{v12.2d,v13.2d},[x7],#32
+	bne	.LNEON_8n_init
+
+	add	x6,sp,#256
+	ld1	{v0.4s,v1.4s},[x1],#32
+	add	x10,sp,#8
+	ldr	s30,[x4],#4
+	mov	x9,x5
+	b	.LNEON_8n_outer
+
+.align	4
+.LNEON_8n_outer:
+	ldr	s28,[x2],#4		// *b++
+	uxtl	v28.4s,v28.4h
+	add	x7,sp,#128
+	ld1	{v2.4s,v3.4s},[x3],#32
+
+	umlal	v6.2d,v28.2s,v0.s[0]
+	umlal	v7.2d,v28.2s,v0.s[1]
+	umlal	v8.2d,v28.2s,v0.s[2]
+	shl	v29.2d,v6.2d,#16
+	ext	v29.16b,v29.16b,v29.16b,#8
+	umlal	v9.2d,v28.2s,v0.s[3]
+	add	v29.2d,v29.2d,v6.2d
+	umlal	v10.2d,v28.2s,v1.s[0]
+	mul	v29.2s,v29.2s,v30.2s
+	umlal	v11.2d,v28.2s,v1.s[1]
+	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
+	umlal	v12.2d,v28.2s,v1.s[2]
+	uxtl	v29.4s,v29.4h
+	umlal	v13.2d,v28.2s,v1.s[3]
+	ldr	s28,[x2],#4		// *b++
+	umlal	v6.2d,v29.2s,v2.s[0]
+	umlal	v7.2d,v29.2s,v2.s[1]
+	uxtl	v28.4s,v28.4h
+	umlal	v8.2d,v29.2s,v2.s[2]
+	ushr	v15.2d,v6.2d,#16
+	umlal	v9.2d,v29.2s,v2.s[3]
+	umlal	v10.2d,v29.2s,v3.s[0]
+	ext	v6.16b,v6.16b,v6.16b,#8
+	add	v6.2d,v6.2d,v15.2d
+	umlal	v11.2d,v29.2s,v3.s[1]
+	ushr	v6.2d,v6.2d,#16
+	umlal	v12.2d,v29.2s,v3.s[2]
+	umlal	v13.2d,v29.2s,v3.s[3]
+	add	v16.2d,v7.2d,v6.2d
+	ins	v7.d[0],v16.d[0]
+	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
+	umlal	v7.2d,v28.2s,v0.s[0]
+	ld1	{v6.2d},[x6],#16
+	umlal	v8.2d,v28.2s,v0.s[1]
+	umlal	v9.2d,v28.2s,v0.s[2]
+	shl	v29.2d,v7.2d,#16
+ ext v29.16b,v29.16b,v29.16b,#8 + umlal v10.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v7.2d + umlal v11.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v12.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1] + umlal v13.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v6.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v7.2d,v29.2s,v2.s[0] + umlal v8.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v9.2d,v29.2s,v2.s[2] + ushr v15.2d,v7.2d,#16 + umlal v10.2d,v29.2s,v2.s[3] + umlal v11.2d,v29.2s,v3.s[0] + ext v7.16b,v7.16b,v7.16b,#8 + add v7.2d,v7.2d,v15.2d + umlal v12.2d,v29.2s,v3.s[1] + ushr v7.2d,v7.2d,#16 + umlal v13.2d,v29.2s,v3.s[2] + umlal v6.2d,v29.2s,v3.s[3] + add v16.2d,v8.2d,v7.2d + ins v8.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1] + umlal v8.2d,v28.2s,v0.s[0] + ld1 {v7.2d},[x6],#16 + umlal v9.2d,v28.2s,v0.s[1] + umlal v10.2d,v28.2s,v0.s[2] + shl v29.2d,v8.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v11.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v8.2d + umlal v12.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v13.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2] + umlal v6.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v7.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v8.2d,v29.2s,v2.s[0] + umlal v9.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v10.2d,v29.2s,v2.s[2] + ushr v15.2d,v8.2d,#16 + umlal v11.2d,v29.2s,v2.s[3] + umlal v12.2d,v29.2s,v3.s[0] + ext v8.16b,v8.16b,v8.16b,#8 + add v8.2d,v8.2d,v15.2d + umlal v13.2d,v29.2s,v3.s[1] + ushr v8.2d,v8.2d,#16 + umlal v6.2d,v29.2s,v3.s[2] + umlal v7.2d,v29.2s,v3.s[3] + add v16.2d,v9.2d,v8.2d + ins v9.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2] + umlal v9.2d,v28.2s,v0.s[0] + ld1 {v8.2d},[x6],#16 + umlal v10.2d,v28.2s,v0.s[1] + umlal v11.2d,v28.2s,v0.s[2] + shl v29.2d,v9.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v12.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v9.2d + umlal v13.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v6.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3] + umlal v7.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v8.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v9.2d,v29.2s,v2.s[0] + umlal v10.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v11.2d,v29.2s,v2.s[2] + ushr v15.2d,v9.2d,#16 + umlal v12.2d,v29.2s,v2.s[3] + umlal v13.2d,v29.2s,v3.s[0] + ext v9.16b,v9.16b,v9.16b,#8 + add v9.2d,v9.2d,v15.2d + umlal v6.2d,v29.2s,v3.s[1] + ushr v9.2d,v9.2d,#16 + umlal v7.2d,v29.2s,v3.s[2] + umlal v8.2d,v29.2s,v3.s[3] + add v16.2d,v10.2d,v9.2d + ins v10.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3] + umlal v10.2d,v28.2s,v0.s[0] + ld1 {v9.2d},[x6],#16 + umlal v11.2d,v28.2s,v0.s[1] + umlal v12.2d,v28.2s,v0.s[2] + shl v29.2d,v10.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v13.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v10.2d + umlal v6.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v7.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4] + umlal v8.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v9.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v10.2d,v29.2s,v2.s[0] + umlal v11.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v12.2d,v29.2s,v2.s[2] + ushr v15.2d,v10.2d,#16 + umlal v13.2d,v29.2s,v2.s[3] + umlal v6.2d,v29.2s,v3.s[0] + ext v10.16b,v10.16b,v10.16b,#8 + add v10.2d,v10.2d,v15.2d + umlal v7.2d,v29.2s,v3.s[1] + ushr v10.2d,v10.2d,#16 + umlal v8.2d,v29.2s,v3.s[2] + umlal v9.2d,v29.2s,v3.s[3] + add v16.2d,v11.2d,v10.2d + ins v11.d[0],v16.d[0] + st1 
{v29.2s},[x10],#8 // put aside smashed m[8*i+4] + umlal v11.2d,v28.2s,v0.s[0] + ld1 {v10.2d},[x6],#16 + umlal v12.2d,v28.2s,v0.s[1] + umlal v13.2d,v28.2s,v0.s[2] + shl v29.2d,v11.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v6.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v11.2d + umlal v7.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v8.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5] + umlal v9.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v10.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v11.2d,v29.2s,v2.s[0] + umlal v12.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v13.2d,v29.2s,v2.s[2] + ushr v15.2d,v11.2d,#16 + umlal v6.2d,v29.2s,v2.s[3] + umlal v7.2d,v29.2s,v3.s[0] + ext v11.16b,v11.16b,v11.16b,#8 + add v11.2d,v11.2d,v15.2d + umlal v8.2d,v29.2s,v3.s[1] + ushr v11.2d,v11.2d,#16 + umlal v9.2d,v29.2s,v3.s[2] + umlal v10.2d,v29.2s,v3.s[3] + add v16.2d,v12.2d,v11.2d + ins v12.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5] + umlal v12.2d,v28.2s,v0.s[0] + ld1 {v11.2d},[x6],#16 + umlal v13.2d,v28.2s,v0.s[1] + umlal v6.2d,v28.2s,v0.s[2] + shl v29.2d,v12.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v7.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v12.2d + umlal v8.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v9.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6] + umlal v10.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v11.2d,v28.2s,v1.s[3] + ldr s28,[x2],#4 // *b++ + umlal v12.2d,v29.2s,v2.s[0] + umlal v13.2d,v29.2s,v2.s[1] + uxtl v28.4s,v28.4h + umlal v6.2d,v29.2s,v2.s[2] + ushr v15.2d,v12.2d,#16 + umlal v7.2d,v29.2s,v2.s[3] + umlal v8.2d,v29.2s,v3.s[0] + ext v12.16b,v12.16b,v12.16b,#8 + add v12.2d,v12.2d,v15.2d + umlal v9.2d,v29.2s,v3.s[1] + ushr v12.2d,v12.2d,#16 + umlal v10.2d,v29.2s,v3.s[2] + umlal v11.2d,v29.2s,v3.s[3] + add v16.2d,v13.2d,v12.2d + ins v13.d[0],v16.d[0] + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6] + umlal v13.2d,v28.2s,v0.s[0] + ld1 {v12.2d},[x6],#16 + umlal v6.2d,v28.2s,v0.s[1] + umlal v7.2d,v28.2s,v0.s[2] + shl v29.2d,v13.2d,#16 + ext v29.16b,v29.16b,v29.16b,#8 + umlal v8.2d,v28.2s,v0.s[3] + add v29.2d,v29.2d,v13.2d + umlal v9.2d,v28.2s,v1.s[0] + mul v29.2s,v29.2s,v30.2s + umlal v10.2d,v28.2s,v1.s[1] + st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7] + umlal v11.2d,v28.2s,v1.s[2] + uxtl v29.4s,v29.4h + umlal v12.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[sp] // pull smashed b[8*i+0] + umlal v13.2d,v29.2s,v2.s[0] + ld1 {v0.4s,v1.4s},[x1],#32 + umlal v6.2d,v29.2s,v2.s[1] + umlal v7.2d,v29.2s,v2.s[2] + mov v5.16b,v13.16b + ushr v5.2d,v5.2d,#16 + ext v13.16b,v13.16b,v13.16b,#8 + umlal v8.2d,v29.2s,v2.s[3] + umlal v9.2d,v29.2s,v3.s[0] + add v13.2d,v13.2d,v5.2d + umlal v10.2d,v29.2s,v3.s[1] + ushr v13.2d,v13.2d,#16 + eor v15.16b,v15.16b,v15.16b + ins v13.d[1],v15.d[0] + umlal v11.2d,v29.2s,v3.s[2] + umlal v12.2d,v29.2s,v3.s[3] + add v6.2d,v6.2d,v13.2d + st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7] + add x10,sp,#8 // rewind + sub x8,x5,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs x8,x8,#8 + umlal v6.2d,v28.2s,v0.s[0] + ld1 {v13.2d},[x6] + umlal v7.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0] + umlal v8.2d,v28.2s,v0.s[2] + ld1 {v2.4s,v3.4s},[x3],#32 + umlal v9.2d,v28.2s,v0.s[3] + b.eq .LInner_jump + add x6,x6,#16 // don't advance in last iteration +.LInner_jump: + umlal v10.2d,v28.2s,v1.s[0] + umlal v11.2d,v28.2s,v1.s[1] + umlal v12.2d,v28.2s,v1.s[2] + umlal v13.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1] + umlal 
v6.2d,v29.2s,v2.s[0] + umlal v7.2d,v29.2s,v2.s[1] + umlal v8.2d,v29.2s,v2.s[2] + umlal v9.2d,v29.2s,v2.s[3] + umlal v10.2d,v29.2s,v3.s[0] + umlal v11.2d,v29.2s,v3.s[1] + umlal v12.2d,v29.2s,v3.s[2] + umlal v13.2d,v29.2s,v3.s[3] + st1 {v6.2d},[x7],#16 + umlal v7.2d,v28.2s,v0.s[0] + ld1 {v6.2d},[x6] + umlal v8.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1] + umlal v9.2d,v28.2s,v0.s[2] + b.eq .LInner_jump1 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump1: + umlal v10.2d,v28.2s,v0.s[3] + umlal v11.2d,v28.2s,v1.s[0] + umlal v12.2d,v28.2s,v1.s[1] + umlal v13.2d,v28.2s,v1.s[2] + umlal v6.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2] + umlal v7.2d,v29.2s,v2.s[0] + umlal v8.2d,v29.2s,v2.s[1] + umlal v9.2d,v29.2s,v2.s[2] + umlal v10.2d,v29.2s,v2.s[3] + umlal v11.2d,v29.2s,v3.s[0] + umlal v12.2d,v29.2s,v3.s[1] + umlal v13.2d,v29.2s,v3.s[2] + umlal v6.2d,v29.2s,v3.s[3] + st1 {v7.2d},[x7],#16 + umlal v8.2d,v28.2s,v0.s[0] + ld1 {v7.2d},[x6] + umlal v9.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2] + umlal v10.2d,v28.2s,v0.s[2] + b.eq .LInner_jump2 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump2: + umlal v11.2d,v28.2s,v0.s[3] + umlal v12.2d,v28.2s,v1.s[0] + umlal v13.2d,v28.2s,v1.s[1] + umlal v6.2d,v28.2s,v1.s[2] + umlal v7.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3] + umlal v8.2d,v29.2s,v2.s[0] + umlal v9.2d,v29.2s,v2.s[1] + umlal v10.2d,v29.2s,v2.s[2] + umlal v11.2d,v29.2s,v2.s[3] + umlal v12.2d,v29.2s,v3.s[0] + umlal v13.2d,v29.2s,v3.s[1] + umlal v6.2d,v29.2s,v3.s[2] + umlal v7.2d,v29.2s,v3.s[3] + st1 {v8.2d},[x7],#16 + umlal v9.2d,v28.2s,v0.s[0] + ld1 {v8.2d},[x6] + umlal v10.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3] + umlal v11.2d,v28.2s,v0.s[2] + b.eq .LInner_jump3 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump3: + umlal v12.2d,v28.2s,v0.s[3] + umlal v13.2d,v28.2s,v1.s[0] + umlal v6.2d,v28.2s,v1.s[1] + umlal v7.2d,v28.2s,v1.s[2] + umlal v8.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4] + umlal v9.2d,v29.2s,v2.s[0] + umlal v10.2d,v29.2s,v2.s[1] + umlal v11.2d,v29.2s,v2.s[2] + umlal v12.2d,v29.2s,v2.s[3] + umlal v13.2d,v29.2s,v3.s[0] + umlal v6.2d,v29.2s,v3.s[1] + umlal v7.2d,v29.2s,v3.s[2] + umlal v8.2d,v29.2s,v3.s[3] + st1 {v9.2d},[x7],#16 + umlal v10.2d,v28.2s,v0.s[0] + ld1 {v9.2d},[x6] + umlal v11.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4] + umlal v12.2d,v28.2s,v0.s[2] + b.eq .LInner_jump4 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump4: + umlal v13.2d,v28.2s,v0.s[3] + umlal v6.2d,v28.2s,v1.s[0] + umlal v7.2d,v28.2s,v1.s[1] + umlal v8.2d,v28.2s,v1.s[2] + umlal v9.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5] + umlal v10.2d,v29.2s,v2.s[0] + umlal v11.2d,v29.2s,v2.s[1] + umlal v12.2d,v29.2s,v2.s[2] + umlal v13.2d,v29.2s,v2.s[3] + umlal v6.2d,v29.2s,v3.s[0] + umlal v7.2d,v29.2s,v3.s[1] + umlal v8.2d,v29.2s,v3.s[2] + umlal v9.2d,v29.2s,v3.s[3] + st1 {v10.2d},[x7],#16 + umlal v11.2d,v28.2s,v0.s[0] + ld1 {v10.2d},[x6] + umlal v12.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5] + umlal v13.2d,v28.2s,v0.s[2] + b.eq .LInner_jump5 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump5: + umlal v6.2d,v28.2s,v0.s[3] + umlal v7.2d,v28.2s,v1.s[0] + umlal v8.2d,v28.2s,v1.s[1] + umlal v9.2d,v28.2s,v1.s[2] + umlal v10.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6] + umlal v11.2d,v29.2s,v2.s[0] + umlal v12.2d,v29.2s,v2.s[1] 
+ umlal v13.2d,v29.2s,v2.s[2] + umlal v6.2d,v29.2s,v2.s[3] + umlal v7.2d,v29.2s,v3.s[0] + umlal v8.2d,v29.2s,v3.s[1] + umlal v9.2d,v29.2s,v3.s[2] + umlal v10.2d,v29.2s,v3.s[3] + st1 {v11.2d},[x7],#16 + umlal v12.2d,v28.2s,v0.s[0] + ld1 {v11.2d},[x6] + umlal v13.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6] + umlal v6.2d,v28.2s,v0.s[2] + b.eq .LInner_jump6 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump6: + umlal v7.2d,v28.2s,v0.s[3] + umlal v8.2d,v28.2s,v1.s[0] + umlal v9.2d,v28.2s,v1.s[1] + umlal v10.2d,v28.2s,v1.s[2] + umlal v11.2d,v28.2s,v1.s[3] + ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7] + umlal v12.2d,v29.2s,v2.s[0] + umlal v13.2d,v29.2s,v2.s[1] + umlal v6.2d,v29.2s,v2.s[2] + umlal v7.2d,v29.2s,v2.s[3] + umlal v8.2d,v29.2s,v3.s[0] + umlal v9.2d,v29.2s,v3.s[1] + umlal v10.2d,v29.2s,v3.s[2] + umlal v11.2d,v29.2s,v3.s[3] + st1 {v12.2d},[x7],#16 + umlal v13.2d,v28.2s,v0.s[0] + ld1 {v12.2d},[x6] + umlal v6.2d,v28.2s,v0.s[1] + ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7] + umlal v7.2d,v28.2s,v0.s[2] + b.eq .LInner_jump7 + add x6,x6,#16 // don't advance in last iteration +.LInner_jump7: + umlal v8.2d,v28.2s,v0.s[3] + umlal v9.2d,v28.2s,v1.s[0] + umlal v10.2d,v28.2s,v1.s[1] + umlal v11.2d,v28.2s,v1.s[2] + umlal v12.2d,v28.2s,v1.s[3] + b.ne .LInner_after_rewind8 + sub x1,x1,x5,lsl#2 // rewind +.LInner_after_rewind8: + umlal v13.2d,v29.2s,v2.s[0] + ld1 {v28.2s},[sp] // pull smashed b[8*i+0] + umlal v6.2d,v29.2s,v2.s[1] + ld1 {v0.4s,v1.4s},[x1],#32 + umlal v7.2d,v29.2s,v2.s[2] + add x10,sp,#8 // rewind + umlal v8.2d,v29.2s,v2.s[3] + umlal v9.2d,v29.2s,v3.s[0] + umlal v10.2d,v29.2s,v3.s[1] + umlal v11.2d,v29.2s,v3.s[2] + st1 {v13.2d},[x7],#16 + umlal v12.2d,v29.2s,v3.s[3] + + bne .LNEON_8n_inner + add x6,sp,#128 + st1 {v6.2d,v7.2d},[x7],#32 + eor v2.16b,v2.16b,v2.16b // v2 + st1 {v8.2d,v9.2d},[x7],#32 + eor v3.16b,v3.16b,v3.16b // v3 + st1 {v10.2d,v11.2d},[x7],#32 + st1 {v12.2d},[x7] + + subs x9,x9,#8 + ld1 {v6.2d,v7.2d},[x6],#32 + ld1 {v8.2d,v9.2d},[x6],#32 + ld1 {v10.2d,v11.2d},[x6],#32 + ld1 {v12.2d,v13.2d},[x6],#32 + + b.eq .LInner_8n_jump_2steps + sub x3,x3,x5,lsl#2 // rewind + b .LNEON_8n_outer + +.LInner_8n_jump_2steps: + add x7,sp,#128 + st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame + mov v5.16b,v6.16b + ushr v15.2d,v6.2d,#16 + ext v6.16b,v6.16b,v6.16b,#8 + st1 {v2.2d,v3.2d}, [sp],#32 + add v6.2d,v6.2d,v15.2d + st1 {v2.2d,v3.2d}, [sp],#32 + ushr v15.2d,v6.2d,#16 + st1 {v2.2d,v3.2d}, [sp],#32 + zip1 v6.4h,v5.4h,v6.4h + ins v15.d[1],v14.d[0] + + mov x8,x5 + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + add v6.2d,v6.2d,v15.2d + mov v5.16b,v6.16b + ushr v15.2d,v6.2d,#16 + ext v6.16b,v6.16b,v6.16b,#8 + ld1 {v8.2d,v9.2d}, [x6],#32 + add v6.2d,v6.2d,v15.2d + ld1 {v10.2d,v11.2d}, [x6],#32 + ushr v15.2d,v6.2d,#16 + ld1 {v12.2d,v13.2d}, [x6],#32 + zip1 v6.4h,v5.4h,v6.4h + ins v15.d[1],v14.d[0] + +.LNEON_tail_entry: + add v7.2d,v7.2d,v15.2d + st1 {v6.s}[0], [x7],#4 + ushr v15.2d,v7.2d,#16 + mov v5.16b,v7.16b + ext v7.16b,v7.16b,v7.16b,#8 + add v7.2d,v7.2d,v15.2d + ushr v15.2d,v7.2d,#16 + zip1 v7.4h,v5.4h,v7.4h + ins v15.d[1],v14.d[0] + add v8.2d,v8.2d,v15.2d + st1 {v7.s}[0], [x7],#4 + ushr v15.2d,v8.2d,#16 + mov v5.16b,v8.16b + ext v8.16b,v8.16b,v8.16b,#8 + add v8.2d,v8.2d,v15.2d + ushr v15.2d,v8.2d,#16 + zip1 v8.4h,v5.4h,v8.4h + ins v15.d[1],v14.d[0] + add v9.2d,v9.2d,v15.2d + st1 {v8.s}[0], [x7],#4 + ushr v15.2d,v9.2d,#16 + mov v5.16b,v9.16b + ext v9.16b,v9.16b,v9.16b,#8 + add v9.2d,v9.2d,v15.2d + ushr v15.2d,v9.2d,#16 + zip1 
v9.4h,v5.4h,v9.4h + ins v15.d[1],v14.d[0] + add v10.2d,v10.2d,v15.2d + st1 {v9.s}[0], [x7],#4 + ushr v15.2d,v10.2d,#16 + mov v5.16b,v10.16b + ext v10.16b,v10.16b,v10.16b,#8 + add v10.2d,v10.2d,v15.2d + ushr v15.2d,v10.2d,#16 + zip1 v10.4h,v5.4h,v10.4h + ins v15.d[1],v14.d[0] + add v11.2d,v11.2d,v15.2d + st1 {v10.s}[0], [x7],#4 + ushr v15.2d,v11.2d,#16 + mov v5.16b,v11.16b + ext v11.16b,v11.16b,v11.16b,#8 + add v11.2d,v11.2d,v15.2d + ushr v15.2d,v11.2d,#16 + zip1 v11.4h,v5.4h,v11.4h + ins v15.d[1],v14.d[0] + add v12.2d,v12.2d,v15.2d + st1 {v11.s}[0], [x7],#4 + ushr v15.2d,v12.2d,#16 + mov v5.16b,v12.16b + ext v12.16b,v12.16b,v12.16b,#8 + add v12.2d,v12.2d,v15.2d + ushr v15.2d,v12.2d,#16 + zip1 v12.4h,v5.4h,v12.4h + ins v15.d[1],v14.d[0] + add v13.2d,v13.2d,v15.2d + st1 {v12.s}[0], [x7],#4 + ushr v15.2d,v13.2d,#16 + mov v5.16b,v13.16b + ext v13.16b,v13.16b,v13.16b,#8 + add v13.2d,v13.2d,v15.2d + ushr v15.2d,v13.2d,#16 + zip1 v13.4h,v5.4h,v13.4h + ins v15.d[1],v14.d[0] + ld1 {v6.2d,v7.2d}, [x6],#32 + subs x8,x8,#8 + st1 {v13.s}[0], [x7],#4 + bne .LNEON_tail + + st1 {v15.s}[0], [x7],#4 // top-most bit + sub x3,x3,x5,lsl#2 // rewind x3 + subs x1,sp,#0 // clear carry flag + add x2,sp,x5,lsl#2 + +.LNEON_sub: + ldp w4,w5,[x1],#8 + ldp w6,w7,[x1],#8 + ldp w8,w9,[x3],#8 + ldp w10,w11,[x3],#8 + sbcs w8,w4,w8 + sbcs w9,w5,w9 + sbcs w10,w6,w10 + sbcs w11,w7,w11 + sub x17,x2,x1 + stp w8,w9,[x0],#8 + stp w10,w11,[x0],#8 + cbnz x17,.LNEON_sub + + ldr w10, [x1] // load top-most bit + mov x11,sp + eor v0.16b,v0.16b,v0.16b + sub x11,x2,x11 // this is num*4 + eor v1.16b,v1.16b,v1.16b + mov x1,sp + sub x0,x0,x11 // rewind x0 + mov x3,x2 // second 3/4th of frame + sbcs w10,w10,wzr // result is carry flag + +.LNEON_copy_n_zap: + ldp w4,w5,[x1],#8 + ldp w6,w7,[x1],#8 + ldp w8,w9,[x0],#8 + ldp w10,w11,[x0] + sub x0,x0,#8 + b.cs .LCopy_1 + mov w8,w4 + mov w9,w5 + mov w10,w6 + mov w11,w7 +.LCopy_1: + st1 {v0.2d,v1.2d}, [x3],#32 // wipe + st1 {v0.2d,v1.2d}, [x3],#32 // wipe + ldp w4,w5,[x1],#8 + ldp w6,w7,[x1],#8 + stp w8,w9,[x0],#8 + stp w10,w11,[x0],#8 + sub x1,x1,#32 + ldp w8,w9,[x0],#8 + ldp w10,w11,[x0] + sub x0,x0,#8 + b.cs .LCopy_2 + mov w8, w4 + mov w9, w5 + mov w10, w6 + mov w11, w7 +.LCopy_2: + st1 {v0.2d,v1.2d}, [x1],#32 // wipe + st1 {v0.2d,v1.2d}, [x3],#32 // wipe + sub x17,x2,x1 // preserves carry + stp w8,w9,[x0],#8 + stp w10,w11,[x0],#8 + cbnz x17,.LNEON_copy_n_zap + + mov sp,x16 + ldp d14,d15,[sp,#64] + ldp d12,d13,[sp,#48] + ldp d10,d11,[sp,#32] + ldp d8,d9,[sp,#16] + ldr x29,[sp],#80 + ret // bx lr + +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon .type __bn_sqr8x_mont,%function .align 5 __bn_sqr8x_mont: @@ -519,7 +1235,7 @@ __bn_sqr8x_mont: ldp x10,x11,[x2,#8*4] ldp x12,x13,[x2,#8*6] adds x19,x19,x6 - ldr x4,[x0,#-8*8] + ldur x4,[x0,#-8*8] adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] adcs x21,x21,x8 @@ -714,7 +1430,7 @@ __bn_sqr8x_mont: //adc x28,xzr,xzr // moved below cbz x27,.Lsqr8x8_post_condition - ldr x4,[x2,#-8*8] + ldur x4,[x2,#-8*8] ldp x6,x7,[x1,#8*0] ldp x8,x9,[x1,#8*2] ldp x10,x11,[x1,#8*4] @@ -772,7 +1488,7 @@ __bn_sqr8x_mont: ldp x12,x13,[x2,#8*6] cbz x27,.Lsqr8x_tail_break - ldr x4,[x0,#-8*8] + ldur x4,[x0,#-8*8] adds x19,x19,x6 adcs x20,x20,x7 ldp x6,x7,[x1,#8*0] diff --git a/sys/crypto/openssl/aarch64/chacha-armv8.S b/sys/crypto/openssl/aarch64/chacha-armv8.S index 24db5ad307319..609e34f422a26 100644 --- a/sys/crypto/openssl/aarch64/chacha-armv8.S +++ b/sys/crypto/openssl/aarch64/chacha-armv8.S @@ -1,23 +1,20 @@ /* Do not modify. This file is auto-generated from chacha-armv8.pl. 
*/ -#include "arm_arch.h" - -.text - +#ifndef __KERNEL__ +# include "arm_arch.h" .hidden OPENSSL_armcap_P +#endif + +.text .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: -.long 1,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.long 1,2,3,4 +.Lrot24: +.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 .globl ChaCha20_ctr32 @@ -25,17 +22,15 @@ .align 5 ChaCha20_ctr32: cbz x2,.Labort - adr x5,.LOPENSSL_armcap_P cmp x2,#192 b.lo .Lshort -#ifdef __ILP32__ - ldrsw x6,[x5] -#else - ldr x6,[x5] -#endif - ldr w17,[x6,x5] + +#ifndef __KERNEL__ + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON - b.ne ChaCha20_neon + b.ne .LChaCha20_neon +#endif .Lshort: .inst 0xd503233f // paciasp @@ -54,7 +49,7 @@ ChaCha20_ctr32: ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ldp x28,x30,[x4] // load counter -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 @@ -215,7 +210,7 @@ ChaCha20_ctr32: add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -272,7 +267,7 @@ ChaCha20_ctr32: add x15,x15,x16,lsl#32 add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -311,9 +306,13 @@ ChaCha20_ctr32: ret .size ChaCha20_ctr32,.-ChaCha20_ctr32 +#ifdef __KERNEL__ +.globl ChaCha20_neon +#endif .type ChaCha20_neon,%function .align 5 ChaCha20_neon: +.LChaCha20_neon: .inst 0xd503233f // paciasp stp x29,x30,[sp,#-96]! 
add x29,sp,#0 @@ -330,15 +329,16 @@ ChaCha20_neon: sub sp,sp,#64 ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 + ld1 {v0.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] + ld1 {v1.4s,v2.4s},[x3] ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __ARMEB__ - rev64 v24.4s,v24.4s + ld1 {v3.4s},[x4] + stp d8,d9,[sp] // meet ABI requirements + ld1 {v8.4s,v9.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v0.4s,v0.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 @@ -346,296 +346,330 @@ ChaCha20_neon: ror x28,x28,#32 ror x30,x30,#32 #endif - add v27.4s,v27.4s,v31.4s // += 1 - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 .Loop_outer_neon: - mov w5,w22 // unpack key block + dup v16.4s,v0.s[0] // unpack key block + mov w5,w22 + dup v20.4s,v0.s[1] lsr x6,x22,#32 - mov v0.16b,v24.16b + dup v24.4s,v0.s[2] mov w7,w23 + dup v28.4s,v0.s[3] lsr x8,x23,#32 - mov v4.16b,v24.16b + dup v17.4s,v1.s[0] mov w9,w24 + dup v21.4s,v1.s[1] lsr x10,x24,#32 - mov v16.16b,v24.16b + dup v25.4s,v1.s[2] mov w11,w25 - mov v1.16b,v25.16b + dup v29.4s,v1.s[3] lsr x12,x25,#32 - mov v5.16b,v25.16b + dup v19.4s,v3.s[0] mov w13,w26 - mov v17.16b,v25.16b + dup v23.4s,v3.s[1] lsr x14,x26,#32 - mov v3.16b,v27.16b + dup v27.4s,v3.s[2] mov w15,w27 - mov v7.16b,v28.16b + dup v31.4s,v3.s[3] lsr x16,x27,#32 - mov v19.16b,v29.16b + add v19.4s,v19.4s,v8.4s mov w17,w28 - mov v2.16b,v26.16b + dup v18.4s,v2.s[0] lsr x19,x28,#32 - mov v6.16b,v26.16b + dup v22.4s,v2.s[1] mov w20,w30 - mov v18.16b,v26.16b + dup v26.4s,v2.s[2] lsr x21,x30,#32 + dup v30.4s,v2.s[3] mov x4,#10 - subs x2,x2,#256 + subs x2,x2,#320 .Loop_neon: sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s + add v16.4s,v16.4s,v17.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v20.4s,v20.4s,v21.4s add w6,w6,w10 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s add w7,w7,w11 - eor v3.16b,v3.16b,v0.16b + add v28.4s,v28.4s,v29.4s add w8,w8,w12 - eor v7.16b,v7.16b,v4.16b - eor w17,w17,w5 eor v19.16b,v19.16b,v16.16b + eor w17,w17,w5 + eor v23.16b,v23.16b,v20.16b eor w19,w19,w6 - rev32 v3.8h,v3.8h + eor v27.16b,v27.16b,v24.16b eor w20,w20,w7 - rev32 v7.8h,v7.8h + eor v31.16b,v31.16b,v28.16b eor w21,w21,w8 rev32 v19.8h,v19.8h ror w17,w17,#16 - add v2.4s,v2.4s,v3.4s + rev32 v23.8h,v23.8h ror w19,w19,#16 - add v6.4s,v6.4s,v7.4s + rev32 v27.8h,v27.8h ror w20,w20,#16 - add v18.4s,v18.4s,v19.4s + rev32 v31.8h,v31.8h ror w21,w21,#16 - eor v20.16b,v1.16b,v2.16b + add v18.4s,v18.4s,v19.4s add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b + add v22.4s,v22.4s,v23.4s add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b + add v26.4s,v26.4s,v27.4s add w15,w15,w20 - ushr v1.4s,v20.4s,#20 + add v30.4s,v30.4s,v31.4s add w16,w16,w21 - ushr v5.4s,v21.4s,#20 + eor v4.16b,v17.16b,v18.16b eor w9,w9,w13 - ushr v17.4s,v22.4s,#20 + eor v5.16b,v21.16b,v22.16b eor w10,w10,w14 - sli v1.4s,v20.4s,#12 + eor v6.16b,v25.16b,v26.16b eor w11,w11,w15 - sli v5.4s,v21.4s,#12 + eor v7.16b,v29.16b,v30.16b eor w12,w12,w16 - sli v17.4s,v22.4s,#12 + ushr v17.4s,v4.4s,#20 ror w9,w9,#20 - add v0.4s,v0.4s,v1.4s + ushr v21.4s,v5.4s,#20 ror w10,w10,#20 - add v4.4s,v4.4s,v5.4s + ushr v25.4s,v6.4s,#20 ror w11,w11,#20 - add v16.4s,v16.4s,v17.4s + ushr v29.4s,v7.4s,#20 ror w12,w12,#20 - eor v20.16b,v3.16b,v0.16b + sli v17.4s,v4.4s,#12 add w5,w5,w9 - eor v21.16b,v7.16b,v4.16b + sli v21.4s,v5.4s,#12 add w6,w6,w10 - eor v22.16b,v19.16b,v16.16b + sli v25.4s,v6.4s,#12 add w7,w7,w11 - ushr v3.4s,v20.4s,#24 + sli v29.4s,v7.4s,#12 add w8,w8,w12 - ushr 
v7.4s,v21.4s,#24 + add v16.4s,v16.4s,v17.4s eor w17,w17,w5 - ushr v19.4s,v22.4s,#24 + add v20.4s,v20.4s,v21.4s eor w19,w19,w6 - sli v3.4s,v20.4s,#8 + add v24.4s,v24.4s,v25.4s eor w20,w20,w7 - sli v7.4s,v21.4s,#8 + add v28.4s,v28.4s,v29.4s eor w21,w21,w8 - sli v19.4s,v22.4s,#8 + eor v4.16b,v19.16b,v16.16b ror w17,w17,#24 - add v2.4s,v2.4s,v3.4s + eor v5.16b,v23.16b,v20.16b ror w19,w19,#24 - add v6.4s,v6.4s,v7.4s + eor v6.16b,v27.16b,v24.16b ror w20,w20,#24 - add v18.4s,v18.4s,v19.4s + eor v7.16b,v31.16b,v28.16b ror w21,w21,#24 - eor v20.16b,v1.16b,v2.16b + tbl v19.16b,{v4.16b},v9.16b add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b + tbl v23.16b,{v5.16b},v9.16b add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b + tbl v27.16b,{v6.16b},v9.16b add w15,w15,w20 - ushr v1.4s,v20.4s,#25 + tbl v31.16b,{v7.16b},v9.16b add w16,w16,w21 - ushr v5.4s,v21.4s,#25 + add v18.4s,v18.4s,v19.4s eor w9,w9,w13 - ushr v17.4s,v22.4s,#25 + add v22.4s,v22.4s,v23.4s eor w10,w10,w14 - sli v1.4s,v20.4s,#7 + add v26.4s,v26.4s,v27.4s eor w11,w11,w15 - sli v5.4s,v21.4s,#7 + add v30.4s,v30.4s,v31.4s eor w12,w12,w16 - sli v17.4s,v22.4s,#7 + eor v4.16b,v17.16b,v18.16b ror w9,w9,#25 - ext v2.16b,v2.16b,v2.16b,#8 + eor v5.16b,v21.16b,v22.16b ror w10,w10,#25 - ext v6.16b,v6.16b,v6.16b,#8 + eor v6.16b,v25.16b,v26.16b ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 + eor v7.16b,v29.16b,v30.16b ror w12,w12,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - add v0.4s,v0.4s,v1.4s + ushr v17.4s,v4.4s,#25 + ushr v21.4s,v5.4s,#25 + ushr v25.4s,v6.4s,#25 + ushr v29.4s,v7.4s,#25 + sli v17.4s,v4.4s,#7 + sli v21.4s,v5.4s,#7 + sli v25.4s,v6.4s,#7 + sli v29.4s,v7.4s,#7 + add v16.4s,v16.4s,v21.4s add w5,w5,w10 - add v4.4s,v4.4s,v5.4s + add v20.4s,v20.4s,v25.4s add w6,w6,w11 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v29.4s add w7,w7,w12 - eor v3.16b,v3.16b,v0.16b + add v28.4s,v28.4s,v17.4s add w8,w8,w9 - eor v7.16b,v7.16b,v4.16b + eor v31.16b,v31.16b,v16.16b eor w21,w21,w5 - eor v19.16b,v19.16b,v16.16b + eor v19.16b,v19.16b,v20.16b eor w17,w17,w6 - rev32 v3.8h,v3.8h + eor v23.16b,v23.16b,v24.16b eor w19,w19,w7 - rev32 v7.8h,v7.8h + eor v27.16b,v27.16b,v28.16b eor w20,w20,w8 - rev32 v19.8h,v19.8h + rev32 v31.8h,v31.8h ror w21,w21,#16 - add v2.4s,v2.4s,v3.4s + rev32 v19.8h,v19.8h ror w17,w17,#16 - add v6.4s,v6.4s,v7.4s + rev32 v23.8h,v23.8h ror w19,w19,#16 - add v18.4s,v18.4s,v19.4s + rev32 v27.8h,v27.8h ror w20,w20,#16 - eor v20.16b,v1.16b,v2.16b + add v26.4s,v26.4s,v31.4s add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b + add v30.4s,v30.4s,v19.4s add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b + add v18.4s,v18.4s,v23.4s add w13,w13,w19 - ushr v1.4s,v20.4s,#20 + add v22.4s,v22.4s,v27.4s add w14,w14,w20 - ushr v5.4s,v21.4s,#20 + eor v4.16b,v21.16b,v26.16b eor w10,w10,w15 - ushr v17.4s,v22.4s,#20 + eor v5.16b,v25.16b,v30.16b eor w11,w11,w16 - sli v1.4s,v20.4s,#12 + eor v6.16b,v29.16b,v18.16b eor w12,w12,w13 - sli v5.4s,v21.4s,#12 + eor v7.16b,v17.16b,v22.16b eor w9,w9,w14 - sli v17.4s,v22.4s,#12 + ushr v21.4s,v4.4s,#20 ror w10,w10,#20 - add v0.4s,v0.4s,v1.4s + ushr v25.4s,v5.4s,#20 ror w11,w11,#20 - add v4.4s,v4.4s,v5.4s + ushr v29.4s,v6.4s,#20 ror w12,w12,#20 - add v16.4s,v16.4s,v17.4s + ushr v17.4s,v7.4s,#20 ror w9,w9,#20 - eor v20.16b,v3.16b,v0.16b + sli v21.4s,v4.4s,#12 add w5,w5,w10 - eor v21.16b,v7.16b,v4.16b + sli v25.4s,v5.4s,#12 add w6,w6,w11 - eor v22.16b,v19.16b,v16.16b + sli v29.4s,v6.4s,#12 add 
w7,w7,w12 - ushr v3.4s,v20.4s,#24 + sli v17.4s,v7.4s,#12 add w8,w8,w9 - ushr v7.4s,v21.4s,#24 + add v16.4s,v16.4s,v21.4s eor w21,w21,w5 - ushr v19.4s,v22.4s,#24 + add v20.4s,v20.4s,v25.4s eor w17,w17,w6 - sli v3.4s,v20.4s,#8 + add v24.4s,v24.4s,v29.4s eor w19,w19,w7 - sli v7.4s,v21.4s,#8 + add v28.4s,v28.4s,v17.4s eor w20,w20,w8 - sli v19.4s,v22.4s,#8 + eor v4.16b,v31.16b,v16.16b ror w21,w21,#24 - add v2.4s,v2.4s,v3.4s + eor v5.16b,v19.16b,v20.16b ror w17,w17,#24 - add v6.4s,v6.4s,v7.4s + eor v6.16b,v23.16b,v24.16b ror w19,w19,#24 - add v18.4s,v18.4s,v19.4s + eor v7.16b,v27.16b,v28.16b ror w20,w20,#24 - eor v20.16b,v1.16b,v2.16b + tbl v31.16b,{v4.16b},v9.16b add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b + tbl v19.16b,{v5.16b},v9.16b add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b + tbl v23.16b,{v6.16b},v9.16b add w13,w13,w19 - ushr v1.4s,v20.4s,#25 + tbl v27.16b,{v7.16b},v9.16b add w14,w14,w20 - ushr v5.4s,v21.4s,#25 + add v26.4s,v26.4s,v31.4s eor w10,w10,w15 - ushr v17.4s,v22.4s,#25 + add v30.4s,v30.4s,v19.4s eor w11,w11,w16 - sli v1.4s,v20.4s,#7 + add v18.4s,v18.4s,v23.4s eor w12,w12,w13 - sli v5.4s,v21.4s,#7 + add v22.4s,v22.4s,v27.4s eor w9,w9,w14 - sli v17.4s,v22.4s,#7 + eor v4.16b,v21.16b,v26.16b ror w10,w10,#25 - ext v2.16b,v2.16b,v2.16b,#8 + eor v5.16b,v25.16b,v30.16b ror w11,w11,#25 - ext v6.16b,v6.16b,v6.16b,#8 + eor v6.16b,v29.16b,v18.16b ror w12,w12,#25 - ext v18.16b,v18.16b,v18.16b,#8 + eor v7.16b,v17.16b,v22.16b ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 + ushr v21.4s,v4.4s,#25 + ushr v25.4s,v5.4s,#25 + ushr v29.4s,v6.4s,#25 + ushr v17.4s,v7.4s,#25 + sli v21.4s,v4.4s,#7 + sli v25.4s,v5.4s,#7 + sli v29.4s,v6.4s,#7 + sli v17.4s,v7.4s,#7 cbnz x4,.Loop_neon + add v19.4s,v19.4s,v8.4s + + zip1 v4.4s,v16.4s,v20.4s // transpose data + zip1 v5.4s,v24.4s,v28.4s + zip2 v6.4s,v16.4s,v20.4s + zip2 v7.4s,v24.4s,v28.4s + zip1 v16.2d,v4.2d,v5.2d + zip2 v20.2d,v4.2d,v5.2d + zip1 v24.2d,v6.2d,v7.2d + zip2 v28.2d,v6.2d,v7.2d + + zip1 v4.4s,v17.4s,v21.4s + zip1 v5.4s,v25.4s,v29.4s + zip2 v6.4s,v17.4s,v21.4s + zip2 v7.4s,v25.4s,v29.4s + zip1 v17.2d,v4.2d,v5.2d + zip2 v21.2d,v4.2d,v5.2d + zip1 v25.2d,v6.2d,v7.2d + zip2 v29.2d,v6.2d,v7.2d + + zip1 v4.4s,v18.4s,v22.4s add w5,w5,w22 // accumulate key block - add v0.4s,v0.4s,v24.4s + zip1 v5.4s,v26.4s,v30.4s add x6,x6,x22,lsr#32 - add v4.4s,v4.4s,v24.4s + zip2 v6.4s,v18.4s,v22.4s add w7,w7,w23 - add v16.4s,v16.4s,v24.4s + zip2 v7.4s,v26.4s,v30.4s add x8,x8,x23,lsr#32 - add v2.4s,v2.4s,v26.4s + zip1 v18.2d,v4.2d,v5.2d add w9,w9,w24 - add v6.4s,v6.4s,v26.4s + zip2 v22.2d,v4.2d,v5.2d add x10,x10,x24,lsr#32 - add v18.4s,v18.4s,v26.4s + zip1 v26.2d,v6.2d,v7.2d add w11,w11,w25 - add v3.4s,v3.4s,v27.4s + zip2 v30.2d,v6.2d,v7.2d add x12,x12,x25,lsr#32 + + zip1 v4.4s,v19.4s,v23.4s add w13,w13,w26 - add v7.4s,v7.4s,v28.4s + zip1 v5.4s,v27.4s,v31.4s add x14,x14,x26,lsr#32 + zip2 v6.4s,v19.4s,v23.4s add w15,w15,w27 - add v19.4s,v19.4s,v29.4s + zip2 v7.4s,v27.4s,v31.4s add x16,x16,x27,lsr#32 + zip1 v19.2d,v4.2d,v5.2d add w17,w17,w28 - add v1.4s,v1.4s,v25.4s + zip2 v23.2d,v4.2d,v5.2d add x19,x19,x28,lsr#32 + zip1 v27.2d,v6.2d,v7.2d add w20,w20,w30 - add v5.4s,v5.4s,v25.4s + zip2 v31.2d,v6.2d,v7.2d add x21,x21,x30,lsr#32 - add v17.4s,v17.4s,v25.4s b.lo .Ltail_neon add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input + add v16.4s,v16.4s,v0.4s // accumulate key block add x9,x9,x10,lsl#32 
add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] + add v17.4s,v17.4s,v1.4s add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] + add v18.4s,v18.4s,v2.4s add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] + add v19.4s,v19.4s,v3.4s add x1,x1,#64 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -645,48 +679,68 @@ ChaCha20_neon: rev x17,x17 rev x20,x20 #endif - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 eor x5,x5,x6 + add v20.4s,v20.4s,v0.4s eor x7,x7,x8 + add v21.4s,v21.4s,v1.4s eor x9,x9,x10 + add v22.4s,v22.4s,v2.4s eor x11,x11,x12 + add v23.4s,v23.4s,v3.4s eor x13,x13,x14 - eor v0.16b,v0.16b,v20.16b + eor v16.16b,v16.16b,v4.16b + movi v4.4s,#5 eor x15,x15,x16 - eor v1.16b,v1.16b,v21.16b + eor v17.16b,v17.16b,v5.16b eor x17,x17,x19 - eor v2.16b,v2.16b,v22.16b + eor v18.16b,v18.16b,v6.16b eor x20,x20,x21 - eor v3.16b,v3.16b,v23.16b - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v19.16b,v19.16b,v7.16b + add v8.4s,v8.4s,v4.4s // += 5 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter + add x28,x28,#5 // increment counter stp x9,x11,[x0,#16] - add v27.4s,v27.4s,v31.4s // += 4 stp x13,x15,[x0,#32] - add v28.4s,v28.4s,v31.4s stp x17,x20,[x0,#48] - add v29.4s,v29.4s,v31.4s add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + add v24.4s,v24.4s,v0.4s + add v25.4s,v25.4s,v1.4s + add v26.4s,v26.4s,v2.4s + add v27.4s,v27.4s,v3.4s + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 + + eor v20.16b,v20.16b,v4.16b + eor v21.16b,v21.16b,v5.16b + eor v22.16b,v22.16b,v6.16b + eor v23.16b,v23.16b,v7.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v1.4s + add v30.4s,v30.4s,v2.4s + add v31.4s,v31.4s,v3.4s + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + eor v24.16b,v24.16b,v16.16b + eor v25.16b,v25.16b,v17.16b + eor v26.16b,v26.16b,v18.16b + eor v27.16b,v27.16b,v19.16b + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 - eor v16.16b,v16.16b,v0.16b - eor v17.16b,v17.16b,v1.16b - eor v18.16b,v18.16b,v2.16b - eor v19.16b,v19.16b,v3.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + eor v28.16b,v28.16b,v20.16b + eor v29.16b,v29.16b,v21.16b + eor v30.16b,v30.16b,v22.16b + eor v31.16b,v31.16b,v23.16b + st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 b.hi .Loop_outer_neon + ldp d8,d9,[sp] // meet ABI requirements + ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] @@ -697,8 +751,10 @@ ChaCha20_neon: .inst 0xd50323bf // autiasp ret +.align 4 .Ltail_neon: - add x2,x2,#256 + add x2,x2,#320 + ldp d8,d9,[sp] // meet ABI requirements cmp x2,#64 b.lo .Less_than_64 @@ -715,7 +771,7 @@ ChaCha20_neon: add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -735,48 +791,68 @@ ChaCha20_neon: eor x20,x20,x21 stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter + add v16.4s,v16.4s,v0.4s // accumulate key block stp x9,x11,[x0,#16] + add v17.4s,v17.4s,v1.4s stp x13,x15,[x0,#32] + add v18.4s,v18.4s,v2.4s stp x17,x20,[x0,#48] + add v19.4s,v19.4s,v3.4s add x0,x0,#64 b.eq .Ldone_neon sub x2,x2,#64 cmp x2,#64 - b.lo .Less_than_128 + b.lo .Last_neon - ld1 
{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v0.16b,v0.16b,v20.16b - eor v1.16b,v1.16b,v21.16b - eor v2.16b,v2.16b,v22.16b - eor v3.16b,v3.16b,v23.16b - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v16.16b,v16.16b,v4.16b + eor v17.16b,v17.16b,v5.16b + eor v18.16b,v18.16b,v6.16b + eor v19.16b,v19.16b,v7.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 b.eq .Ldone_neon + + add v16.4s,v20.4s,v0.4s + add v17.4s,v21.4s,v1.4s sub x2,x2,#64 + add v18.4s,v22.4s,v2.4s cmp x2,#64 - b.lo .Less_than_192 + add v19.4s,v23.4s,v3.4s + b.lo .Last_neon - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v20.16b,v16.16b,v4.16b + eor v21.16b,v17.16b,v5.16b + eor v22.16b,v18.16b,v6.16b + eor v23.16b,v19.16b,v7.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 b.eq .Ldone_neon + + add v16.4s,v24.4s,v0.4s + add v17.4s,v25.4s,v1.4s sub x2,x2,#64 + add v18.4s,v26.4s,v2.4s + cmp x2,#64 + add v19.4s,v27.4s,v3.4s + b.lo .Last_neon - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] - b .Last_neon + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v24.16b,v16.16b,v4.16b + eor v25.16b,v17.16b,v5.16b + eor v26.16b,v18.16b,v6.16b + eor v27.16b,v19.16b,v7.16b + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + b.eq .Ldone_neon -.Less_than_128: - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] - b .Last_neon -.Less_than_192: - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] - b .Last_neon + add v16.4s,v28.4s,v0.4s + add v17.4s,v29.4s,v1.4s + add v18.4s,v30.4s,v2.4s + add v19.4s,v31.4s,v3.4s + sub x2,x2,#64 -.align 4 .Last_neon: + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + sub x0,x0,#1 add x1,x1,x2 add x0,x0,x2 @@ -824,16 +900,18 @@ ChaCha20_512_neon: .L512_or_more_neon: sub sp,sp,#128+64 + eor v7.16b,v7.16b,v7.16b ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 + ld1 {v0.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] + ld1 {v1.4s,v2.4s},[x3] ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __ARMEB__ - rev64 v24.4s,v24.4s + ld1 {v3.4s},[x4] + ld1 {v7.s}[0],[x5] + add x3,x5,#16 // .Lrot24 +#ifdef __AARCH64EB__ + rev64 v0.4s,v0.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 @@ -841,14 +919,14 @@ ChaCha20_512_neon: ror x28,x28,#32 ror x30,x30,#32 #endif - add v27.4s,v27.4s,v31.4s // += 1 - stp q24,q25,[sp,#0] // off-load key block, invariant part - add v27.4s,v27.4s,v31.4s // not typo - str q26,[sp,#32] - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - add v30.4s,v29.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 + add v3.4s,v3.4s,v7.4s // += 1 + stp q0,q1,[sp,#0] // off-load key block, invariant part + add v3.4s,v3.4s,v7.4s // not typo + str q2,[sp,#32] + add v4.4s,v3.4s,v7.4s + add v5.4s,v4.4s,v7.4s + add v6.4s,v5.4s,v7.4s + shl v7.4s,v7.4s,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] @@ -858,461 +936,450 @@ ChaCha20_512_neon: sub x2,x2,#512 // not typo .Loop_outer_512_neon: - mov v0.16b,v24.16b - mov v4.16b,v24.16b - mov v8.16b,v24.16b - mov v12.16b,v24.16b - mov v16.16b,v24.16b - mov v20.16b,v24.16b - mov v1.16b,v25.16b + mov v8.16b,v0.16b + mov v12.16b,v0.16b + mov v16.16b,v0.16b + mov v20.16b,v0.16b + mov v24.16b,v0.16b + mov v28.16b,v0.16b + mov v9.16b,v1.16b mov w5,w22 // unpack key block - mov v5.16b,v25.16b + mov v13.16b,v1.16b lsr x6,x22,#32 - mov 
v9.16b,v25.16b + mov v17.16b,v1.16b mov w7,w23 - mov v13.16b,v25.16b + mov v21.16b,v1.16b lsr x8,x23,#32 - mov v17.16b,v25.16b + mov v25.16b,v1.16b mov w9,w24 - mov v21.16b,v25.16b + mov v29.16b,v1.16b lsr x10,x24,#32 - mov v3.16b,v27.16b + mov v11.16b,v3.16b mov w11,w25 - mov v7.16b,v28.16b + mov v15.16b,v4.16b lsr x12,x25,#32 - mov v11.16b,v29.16b + mov v19.16b,v5.16b mov w13,w26 - mov v15.16b,v30.16b + mov v23.16b,v6.16b lsr x14,x26,#32 - mov v2.16b,v26.16b + mov v10.16b,v2.16b mov w15,w27 - mov v6.16b,v26.16b + mov v14.16b,v2.16b lsr x16,x27,#32 - add v19.4s,v3.4s,v31.4s // +4 + add v27.4s,v11.4s,v7.4s // +4 mov w17,w28 - add v23.4s,v7.4s,v31.4s // +4 + add v31.4s,v15.4s,v7.4s // +4 lsr x19,x28,#32 - mov v10.16b,v26.16b + mov v18.16b,v2.16b mov w20,w30 - mov v14.16b,v26.16b + mov v22.16b,v2.16b lsr x21,x30,#32 - mov v18.16b,v26.16b - stp q27,q28,[sp,#48] // off-load key block, variable part - mov v22.16b,v26.16b - str q29,[sp,#80] + mov v26.16b,v2.16b + stp q3,q4,[sp,#48] // off-load key block, variable part + mov v30.16b,v2.16b + stp q5,q6,[sp,#80] mov x4,#5 + ld1 {v6.4s},[x3] subs x2,x2,#512 .Loop_upper_neon: sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s + add v8.4s,v8.4s,v9.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s add w6,w6,w10 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s add w7,w7,w11 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s add w8,w8,w12 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 - rev32 v3.8h,v3.8h + rev32 v11.8h,v11.8h add w13,w13,w17 - rev32 v7.8h,v7.8h + rev32 v15.8h,v15.8h add w14,w14,w19 - rev32 v11.8h,v11.8h + rev32 v19.8h,v19.8h add w15,w15,w20 - rev32 v15.8h,v15.8h + rev32 v23.8h,v23.8h add w16,w16,w21 - rev32 v19.8h,v19.8h + rev32 v27.8h,v27.8h eor w9,w9,w13 - rev32 v23.8h,v23.8h + rev32 v31.8h,v31.8h eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s + add v10.4s,v10.4s,v11.4s eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s + add v14.4s,v14.4s,v15.4s eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s + add v18.4s,v18.4s,v19.4s ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s + add v22.4s,v22.4s,v23.4s ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s + add v26.4s,v26.4s,v27.4s ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s + add v30.4s,v30.4s,v31.4s ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b + eor v0.16b,v9.16b,v10.16b add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b + eor v1.16b,v13.16b,v14.16b add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b + eor v2.16b,v17.16b,v18.16b add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b + eor v3.16b,v21.16b,v22.16b add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b + eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b + eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 + ushr v9.4s,v0.4s,#20 eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 + ushr v13.4s,v1.4s,#20 eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 + ushr v17.4s,v2.4s,#20 ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 + ushr v21.4s,v3.4s,#20 ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 + ushr v25.4s,v4.4s,#20 ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 + ushr 
v29.4s,v5.4s,#20 ror w21,w21,#24 - sli v1.4s,v24.4s,#12 + sli v9.4s,v0.4s,#12 add w13,w13,w17 - sli v5.4s,v25.4s,#12 + sli v13.4s,v1.4s,#12 add w14,w14,w19 - sli v9.4s,v26.4s,#12 + sli v17.4s,v2.4s,#12 add w15,w15,w20 - sli v13.4s,v27.4s,#12 + sli v21.4s,v3.4s,#12 add w16,w16,w21 - sli v17.4s,v28.4s,#12 + sli v25.4s,v4.4s,#12 eor w9,w9,w13 - sli v21.4s,v29.4s,#12 + sli v29.4s,v5.4s,#12 eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s + add v8.4s,v8.4s,v9.4s eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 + tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 + tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 + tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 + tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 + tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 + tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 - sli v3.4s,v24.4s,#8 + add v10.4s,v10.4s,v11.4s add w15,w15,w21 - sli v7.4s,v25.4s,#8 + add v14.4s,v14.4s,v15.4s add w16,w16,w17 - sli v11.4s,v26.4s,#8 + add v18.4s,v18.4s,v19.4s add w13,w13,w19 - sli v15.4s,v27.4s,#8 + add v22.4s,v22.4s,v23.4s add w14,w14,w20 - sli v19.4s,v28.4s,#8 + add v26.4s,v26.4s,v27.4s eor w10,w10,w15 - sli v23.4s,v29.4s,#8 + add v30.4s,v30.4s,v31.4s eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s + eor v0.16b,v9.16b,v10.16b eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s + eor v1.16b,v13.16b,v14.16b eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s + eor v2.16b,v17.16b,v18.16b ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s + eor v3.16b,v21.16b,v22.16b ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s + eor v4.16b,v25.16b,v26.16b ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s + eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b + ushr v9.4s,v0.4s,#25 add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b + ushr v13.4s,v1.4s,#25 add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b + ushr v17.4s,v2.4s,#25 add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b + ushr v21.4s,v3.4s,#25 add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b + ushr v25.4s,v4.4s,#25 eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b + ushr v29.4s,v5.4s,#25 eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 + sli v9.4s,v0.4s,#7 eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 + sli v13.4s,v1.4s,#7 eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 + sli v17.4s,v2.4s,#7 ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 + sli v21.4s,v3.4s,#7 ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 + sli v25.4s,v4.4s,#7 ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 + sli v29.4s,v5.4s,#7 ror w20,w20,#24 - sli v1.4s,v24.4s,#7 + ext v10.16b,v10.16b,v10.16b,#8 add w15,w15,w21 - sli v5.4s,v25.4s,#7 + ext v14.16b,v14.16b,v14.16b,#8 add w16,w16,w17 - sli v9.4s,v26.4s,#7 + ext v18.16b,v18.16b,v18.16b,#8 add w13,w13,w19 - sli v13.4s,v27.4s,#7 + ext v22.16b,v22.16b,v22.16b,#8 add w14,w14,w20 - 
sli v17.4s,v28.4s,#7 + ext v26.16b,v26.16b,v26.16b,#8 eor w10,w10,w15 - sli v21.4s,v29.4s,#7 + ext v30.16b,v30.16b,v30.16b,#8 eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 + ext v11.16b,v11.16b,v11.16b,#12 eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 + ext v15.16b,v15.16b,v15.16b,#12 eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 + ext v19.16b,v19.16b,v19.16b,#12 ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 + ext v23.16b,v23.16b,v23.16b,#12 ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 + ext v27.16b,v27.16b,v27.16b,#12 ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 + ext v31.16b,v31.16b,v31.16b,#12 ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s + ext v25.16b,v25.16b,v25.16b,#4 + ext v29.16b,v29.16b,v29.16b,#4 + add v8.4s,v8.4s,v9.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s add w6,w6,w10 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s add w7,w7,w11 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s add w8,w8,w12 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 - rev32 v3.8h,v3.8h + rev32 v11.8h,v11.8h add w13,w13,w17 - rev32 v7.8h,v7.8h + rev32 v15.8h,v15.8h add w14,w14,w19 - rev32 v11.8h,v11.8h + rev32 v19.8h,v19.8h add w15,w15,w20 - rev32 v15.8h,v15.8h + rev32 v23.8h,v23.8h add w16,w16,w21 - rev32 v19.8h,v19.8h + rev32 v27.8h,v27.8h eor w9,w9,w13 - rev32 v23.8h,v23.8h + rev32 v31.8h,v31.8h eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s + add v10.4s,v10.4s,v11.4s eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s + add v14.4s,v14.4s,v15.4s eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s + add v18.4s,v18.4s,v19.4s ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s + add v22.4s,v22.4s,v23.4s ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s + add v26.4s,v26.4s,v27.4s ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s + add v30.4s,v30.4s,v31.4s ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b + eor v0.16b,v9.16b,v10.16b add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b + eor v1.16b,v13.16b,v14.16b add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b + eor v2.16b,v17.16b,v18.16b add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b + eor v3.16b,v21.16b,v22.16b add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b + eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b + eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 + ushr v9.4s,v0.4s,#20 eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 + ushr v13.4s,v1.4s,#20 eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 + ushr v17.4s,v2.4s,#20 ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 + ushr v21.4s,v3.4s,#20 ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 + ushr v25.4s,v4.4s,#20 ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 + ushr v29.4s,v5.4s,#20 ror w21,w21,#24 - sli v1.4s,v24.4s,#12 + sli v9.4s,v0.4s,#12 add 
w13,w13,w17 - sli v5.4s,v25.4s,#12 + sli v13.4s,v1.4s,#12 add w14,w14,w19 - sli v9.4s,v26.4s,#12 + sli v17.4s,v2.4s,#12 add w15,w15,w20 - sli v13.4s,v27.4s,#12 + sli v21.4s,v3.4s,#12 add w16,w16,w21 - sli v17.4s,v28.4s,#12 + sli v25.4s,v4.4s,#12 eor w9,w9,w13 - sli v21.4s,v29.4s,#12 + sli v29.4s,v5.4s,#12 eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s + add v8.4s,v8.4s,v9.4s eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 + tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 + tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 + tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 + tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 + tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 + tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 - sli v3.4s,v24.4s,#8 + add v10.4s,v10.4s,v11.4s add w15,w15,w21 - sli v7.4s,v25.4s,#8 + add v14.4s,v14.4s,v15.4s add w16,w16,w17 - sli v11.4s,v26.4s,#8 + add v18.4s,v18.4s,v19.4s add w13,w13,w19 - sli v15.4s,v27.4s,#8 + add v22.4s,v22.4s,v23.4s add w14,w14,w20 - sli v19.4s,v28.4s,#8 + add v26.4s,v26.4s,v27.4s eor w10,w10,w15 - sli v23.4s,v29.4s,#8 + add v30.4s,v30.4s,v31.4s eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s + eor v0.16b,v9.16b,v10.16b eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s + eor v1.16b,v13.16b,v14.16b eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s + eor v2.16b,v17.16b,v18.16b ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s + eor v3.16b,v21.16b,v22.16b ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s + eor v4.16b,v25.16b,v26.16b ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s + eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b + ushr v9.4s,v0.4s,#25 add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b + ushr v13.4s,v1.4s,#25 add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b + ushr v17.4s,v2.4s,#25 add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b + ushr v21.4s,v3.4s,#25 add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b + ushr v25.4s,v4.4s,#25 eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b + ushr v29.4s,v5.4s,#25 eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 + sli v9.4s,v0.4s,#7 eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 + sli v13.4s,v1.4s,#7 eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 + sli v17.4s,v2.4s,#7 ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 + sli v21.4s,v3.4s,#7 ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 + sli v25.4s,v4.4s,#7 ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 + sli v29.4s,v5.4s,#7 ror w20,w20,#24 - sli v1.4s,v24.4s,#7 + ext v10.16b,v10.16b,v10.16b,#8 add w15,w15,w21 - sli v5.4s,v25.4s,#7 + ext v14.16b,v14.16b,v14.16b,#8 add w16,w16,w17 - sli v9.4s,v26.4s,#7 + ext v18.16b,v18.16b,v18.16b,#8 add w13,w13,w19 - sli v13.4s,v27.4s,#7 + ext v22.16b,v22.16b,v22.16b,#8 add w14,w14,w20 - sli v17.4s,v28.4s,#7 + ext v26.16b,v26.16b,v26.16b,#8 eor w10,w10,w15 - sli 
v21.4s,v29.4s,#7 + ext v30.16b,v30.16b,v30.16b,#8 eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 + ext v11.16b,v11.16b,v11.16b,#4 eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 + ext v15.16b,v15.16b,v15.16b,#4 eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 + ext v19.16b,v19.16b,v19.16b,#4 ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 + ext v23.16b,v23.16b,v23.16b,#4 ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 + ext v27.16b,v27.16b,v27.16b,#4 ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 + ext v31.16b,v31.16b,v31.16b,#4 ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 + ext v25.16b,v25.16b,v25.16b,#12 + ext v29.16b,v29.16b,v29.16b,#12 cbnz x4,.Loop_upper_neon add w5,w5,w22 // accumulate key block @@ -1345,7 +1412,7 @@ ChaCha20_512_neon: add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -1390,476 +1457,465 @@ ChaCha20_512_neon: mov x4,#5 .Loop_lower_neon: sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s + add v8.4s,v8.4s,v9.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s add w6,w6,w10 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s add w7,w7,w11 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s add w8,w8,w12 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 - rev32 v3.8h,v3.8h + rev32 v11.8h,v11.8h add w13,w13,w17 - rev32 v7.8h,v7.8h + rev32 v15.8h,v15.8h add w14,w14,w19 - rev32 v11.8h,v11.8h + rev32 v19.8h,v19.8h add w15,w15,w20 - rev32 v15.8h,v15.8h + rev32 v23.8h,v23.8h add w16,w16,w21 - rev32 v19.8h,v19.8h + rev32 v27.8h,v27.8h eor w9,w9,w13 - rev32 v23.8h,v23.8h + rev32 v31.8h,v31.8h eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s + add v10.4s,v10.4s,v11.4s eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s + add v14.4s,v14.4s,v15.4s eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s + add v18.4s,v18.4s,v19.4s ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s + add v22.4s,v22.4s,v23.4s ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s + add v26.4s,v26.4s,v27.4s ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s + add v30.4s,v30.4s,v31.4s ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b + eor v0.16b,v9.16b,v10.16b add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b + eor v1.16b,v13.16b,v14.16b add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b + eor v2.16b,v17.16b,v18.16b add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b + eor v3.16b,v21.16b,v22.16b add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b + eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b + eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 + ushr v9.4s,v0.4s,#20 eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 + ushr v13.4s,v1.4s,#20 eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 + ushr v17.4s,v2.4s,#20 ror w17,w17,#24 - ushr 
v13.4s,v27.4s,#20 + ushr v21.4s,v3.4s,#20 ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 + ushr v25.4s,v4.4s,#20 ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 + ushr v29.4s,v5.4s,#20 ror w21,w21,#24 - sli v1.4s,v24.4s,#12 + sli v9.4s,v0.4s,#12 add w13,w13,w17 - sli v5.4s,v25.4s,#12 + sli v13.4s,v1.4s,#12 add w14,w14,w19 - sli v9.4s,v26.4s,#12 + sli v17.4s,v2.4s,#12 add w15,w15,w20 - sli v13.4s,v27.4s,#12 + sli v21.4s,v3.4s,#12 add w16,w16,w21 - sli v17.4s,v28.4s,#12 + sli v25.4s,v4.4s,#12 eor w9,w9,w13 - sli v21.4s,v29.4s,#12 + sli v29.4s,v5.4s,#12 eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s + add v8.4s,v8.4s,v9.4s eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 + tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 + tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 + tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 + tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 + tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 + tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 - sli v3.4s,v24.4s,#8 + add v10.4s,v10.4s,v11.4s add w15,w15,w21 - sli v7.4s,v25.4s,#8 + add v14.4s,v14.4s,v15.4s add w16,w16,w17 - sli v11.4s,v26.4s,#8 + add v18.4s,v18.4s,v19.4s add w13,w13,w19 - sli v15.4s,v27.4s,#8 + add v22.4s,v22.4s,v23.4s add w14,w14,w20 - sli v19.4s,v28.4s,#8 + add v26.4s,v26.4s,v27.4s eor w10,w10,w15 - sli v23.4s,v29.4s,#8 + add v30.4s,v30.4s,v31.4s eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s + eor v0.16b,v9.16b,v10.16b eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s + eor v1.16b,v13.16b,v14.16b eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s + eor v2.16b,v17.16b,v18.16b ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s + eor v3.16b,v21.16b,v22.16b ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s + eor v4.16b,v25.16b,v26.16b ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s + eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b + ushr v9.4s,v0.4s,#25 add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b + ushr v13.4s,v1.4s,#25 add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b + ushr v17.4s,v2.4s,#25 add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b + ushr v21.4s,v3.4s,#25 add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b + ushr v25.4s,v4.4s,#25 eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b + ushr v29.4s,v5.4s,#25 eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 + sli v9.4s,v0.4s,#7 eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 + sli v13.4s,v1.4s,#7 eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 + sli v17.4s,v2.4s,#7 ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 + sli v21.4s,v3.4s,#7 ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 + sli v25.4s,v4.4s,#7 ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 + sli v29.4s,v5.4s,#7 ror w20,w20,#24 - sli v1.4s,v24.4s,#7 + ext v10.16b,v10.16b,v10.16b,#8 add w15,w15,w21 - sli v5.4s,v25.4s,#7 + ext v14.16b,v14.16b,v14.16b,#8 add 
w16,w16,w17 - sli v9.4s,v26.4s,#7 + ext v18.16b,v18.16b,v18.16b,#8 add w13,w13,w19 - sli v13.4s,v27.4s,#7 + ext v22.16b,v22.16b,v22.16b,#8 add w14,w14,w20 - sli v17.4s,v28.4s,#7 + ext v26.16b,v26.16b,v26.16b,#8 eor w10,w10,w15 - sli v21.4s,v29.4s,#7 + ext v30.16b,v30.16b,v30.16b,#8 eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 + ext v11.16b,v11.16b,v11.16b,#12 eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 + ext v15.16b,v15.16b,v15.16b,#12 eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 + ext v19.16b,v19.16b,v19.16b,#12 ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 + ext v23.16b,v23.16b,v23.16b,#12 ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 + ext v27.16b,v27.16b,v27.16b,#12 ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 + ext v31.16b,v31.16b,v31.16b,#12 ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s + ext v25.16b,v25.16b,v25.16b,#4 + ext v29.16b,v29.16b,v29.16b,#4 + add v8.4s,v8.4s,v9.4s add w5,w5,w9 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s add w6,w6,w10 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s add w7,w7,w11 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s add w8,w8,w12 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 - rev32 v3.8h,v3.8h + rev32 v11.8h,v11.8h add w13,w13,w17 - rev32 v7.8h,v7.8h + rev32 v15.8h,v15.8h add w14,w14,w19 - rev32 v11.8h,v11.8h + rev32 v19.8h,v19.8h add w15,w15,w20 - rev32 v15.8h,v15.8h + rev32 v23.8h,v23.8h add w16,w16,w21 - rev32 v19.8h,v19.8h + rev32 v27.8h,v27.8h eor w9,w9,w13 - rev32 v23.8h,v23.8h + rev32 v31.8h,v31.8h eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s + add v10.4s,v10.4s,v11.4s eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s + add v14.4s,v14.4s,v15.4s eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s + add v18.4s,v18.4s,v19.4s ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s + add v22.4s,v22.4s,v23.4s ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s + add v26.4s,v26.4s,v27.4s ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s + add v30.4s,v30.4s,v31.4s ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b + eor v0.16b,v9.16b,v10.16b add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b + eor v1.16b,v13.16b,v14.16b add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b + eor v2.16b,v17.16b,v18.16b add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b + eor v3.16b,v21.16b,v22.16b add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b + eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b + eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 + ushr v9.4s,v0.4s,#20 eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 + ushr v13.4s,v1.4s,#20 eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 + ushr v17.4s,v2.4s,#20 ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 + ushr v21.4s,v3.4s,#20 ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 + 
ushr v25.4s,v4.4s,#20 ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 + ushr v29.4s,v5.4s,#20 ror w21,w21,#24 - sli v1.4s,v24.4s,#12 + sli v9.4s,v0.4s,#12 add w13,w13,w17 - sli v5.4s,v25.4s,#12 + sli v13.4s,v1.4s,#12 add w14,w14,w19 - sli v9.4s,v26.4s,#12 + sli v17.4s,v2.4s,#12 add w15,w15,w20 - sli v13.4s,v27.4s,#12 + sli v21.4s,v3.4s,#12 add w16,w16,w21 - sli v17.4s,v28.4s,#12 + sli v25.4s,v4.4s,#12 eor w9,w9,w13 - sli v21.4s,v29.4s,#12 + sli v29.4s,v5.4s,#12 eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s + add v8.4s,v8.4s,v9.4s eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s + add v12.4s,v12.4s,v13.4s eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s + add v16.4s,v16.4s,v17.4s ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s + add v20.4s,v20.4s,v21.4s ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s + add v24.4s,v24.4s,v25.4s ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s + add v28.4s,v28.4s,v29.4s ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b + eor v11.16b,v11.16b,v8.16b add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b + eor v15.16b,v15.16b,v12.16b add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b + eor v19.16b,v19.16b,v16.16b add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b + eor v23.16b,v23.16b,v20.16b add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b + eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b + eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 + tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 + tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 + tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 + tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 + tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 + tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 - sli v3.4s,v24.4s,#8 + add v10.4s,v10.4s,v11.4s add w15,w15,w21 - sli v7.4s,v25.4s,#8 + add v14.4s,v14.4s,v15.4s add w16,w16,w17 - sli v11.4s,v26.4s,#8 + add v18.4s,v18.4s,v19.4s add w13,w13,w19 - sli v15.4s,v27.4s,#8 + add v22.4s,v22.4s,v23.4s add w14,w14,w20 - sli v19.4s,v28.4s,#8 + add v26.4s,v26.4s,v27.4s eor w10,w10,w15 - sli v23.4s,v29.4s,#8 + add v30.4s,v30.4s,v31.4s eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s + eor v0.16b,v9.16b,v10.16b eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s + eor v1.16b,v13.16b,v14.16b eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s + eor v2.16b,v17.16b,v18.16b ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s + eor v3.16b,v21.16b,v22.16b ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s + eor v4.16b,v25.16b,v26.16b ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s + eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b + ushr v9.4s,v0.4s,#25 add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b + ushr v13.4s,v1.4s,#25 add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b + ushr v17.4s,v2.4s,#25 add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b + ushr v21.4s,v3.4s,#25 add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b + ushr v25.4s,v4.4s,#25 eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b + ushr v29.4s,v5.4s,#25 eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 + sli v9.4s,v0.4s,#7 eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 + sli v13.4s,v1.4s,#7 eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 + sli v17.4s,v2.4s,#7 ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 + sli v21.4s,v3.4s,#7 ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 + sli v25.4s,v4.4s,#7 ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 + sli v29.4s,v5.4s,#7 ror w20,w20,#24 - sli v1.4s,v24.4s,#7 + ext v10.16b,v10.16b,v10.16b,#8 add w15,w15,w21 - sli v5.4s,v25.4s,#7 + ext v14.16b,v14.16b,v14.16b,#8 add w16,w16,w17 - sli v9.4s,v26.4s,#7 + ext v18.16b,v18.16b,v18.16b,#8 add w13,w13,w19 - 
sli v13.4s,v27.4s,#7 + ext v22.16b,v22.16b,v22.16b,#8 add w14,w14,w20 - sli v17.4s,v28.4s,#7 + ext v26.16b,v26.16b,v26.16b,#8 eor w10,w10,w15 - sli v21.4s,v29.4s,#7 + ext v30.16b,v30.16b,v30.16b,#8 eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 + ext v11.16b,v11.16b,v11.16b,#4 eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 + ext v15.16b,v15.16b,v15.16b,#4 eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 + ext v19.16b,v19.16b,v19.16b,#4 ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 + ext v23.16b,v23.16b,v23.16b,#4 ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 + ext v27.16b,v27.16b,v27.16b,#4 ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 + ext v31.16b,v31.16b,v31.16b,#4 ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 + ext v25.16b,v25.16b,v25.16b,#12 + ext v29.16b,v29.16b,v29.16b,#12 cbnz x4,.Loop_lower_neon add w5,w5,w22 // accumulate key block - ldp q24,q25,[sp,#0] + ldp q0,q1,[sp,#0] add x6,x6,x22,lsr#32 - ldp q26,q27,[sp,#32] + ldp q2,q3,[sp,#32] add w7,w7,w23 - ldp q28,q29,[sp,#64] + ldp q4,q5,[sp,#64] add x8,x8,x23,lsr#32 - add v0.4s,v0.4s,v24.4s + ldr q6,[sp,#96] + add v8.4s,v8.4s,v0.4s add w9,w9,w24 - add v4.4s,v4.4s,v24.4s + add v12.4s,v12.4s,v0.4s add x10,x10,x24,lsr#32 - add v8.4s,v8.4s,v24.4s + add v16.4s,v16.4s,v0.4s add w11,w11,w25 - add v12.4s,v12.4s,v24.4s + add v20.4s,v20.4s,v0.4s add x12,x12,x25,lsr#32 - add v16.4s,v16.4s,v24.4s + add v24.4s,v24.4s,v0.4s add w13,w13,w26 - add v20.4s,v20.4s,v24.4s + add v28.4s,v28.4s,v0.4s add x14,x14,x26,lsr#32 - add v2.4s,v2.4s,v26.4s + add v10.4s,v10.4s,v2.4s add w15,w15,w27 - add v6.4s,v6.4s,v26.4s + add v14.4s,v14.4s,v2.4s add x16,x16,x27,lsr#32 - add v10.4s,v10.4s,v26.4s + add v18.4s,v18.4s,v2.4s add w17,w17,w28 - add v14.4s,v14.4s,v26.4s + add v22.4s,v22.4s,v2.4s add x19,x19,x28,lsr#32 - add v18.4s,v18.4s,v26.4s + add v26.4s,v26.4s,v2.4s add w20,w20,w30 - add v22.4s,v22.4s,v26.4s + add v30.4s,v30.4s,v2.4s add x21,x21,x30,lsr#32 - add v19.4s,v19.4s,v31.4s // +4 + add v27.4s,v27.4s,v7.4s // +4 add x5,x5,x6,lsl#32 // pack - add v23.4s,v23.4s,v31.4s // +4 + add v31.4s,v31.4s,v7.4s // +4 add x7,x7,x8,lsl#32 - add v3.4s,v3.4s,v27.4s + add v11.4s,v11.4s,v3.4s ldp x6,x8,[x1,#0] // load input - add v7.4s,v7.4s,v28.4s + add v15.4s,v15.4s,v4.4s add x9,x9,x10,lsl#32 - add v11.4s,v11.4s,v29.4s + add v19.4s,v19.4s,v5.4s add x11,x11,x12,lsl#32 - add v15.4s,v15.4s,v30.4s + add v23.4s,v23.4s,v6.4s ldp x10,x12,[x1,#16] - add v19.4s,v19.4s,v27.4s + add v27.4s,v27.4s,v3.4s add x13,x13,x14,lsl#32 - add v23.4s,v23.4s,v28.4s + add v31.4s,v31.4s,v4.4s add x15,x15,x16,lsl#32 - add v1.4s,v1.4s,v25.4s + add v9.4s,v9.4s,v1.4s ldp x14,x16,[x1,#32] - add v5.4s,v5.4s,v25.4s + add v13.4s,v13.4s,v1.4s add x17,x17,x19,lsl#32 - add v9.4s,v9.4s,v25.4s + add v17.4s,v17.4s,v1.4s add x20,x20,x21,lsl#32 - add v13.4s,v13.4s,v25.4s + add v21.4s,v21.4s,v1.4s ldp x19,x21,[x1,#48] - add v17.4s,v17.4s,v25.4s + add v25.4s,v25.4s,v1.4s add x1,x1,#64 - add v21.4s,v21.4s,v25.4s + add v29.4s,v29.4s,v1.4s -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 @@ -1869,20 +1925,20 @@ ChaCha20_512_neon: rev x17,x17 rev x20,x20 #endif - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor 
x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 - eor v0.16b,v0.16b,v24.16b + eor v8.16b,v8.16b,v0.16b eor x15,x15,x16 - eor v1.16b,v1.16b,v25.16b + eor v9.16b,v9.16b,v1.16b eor x17,x17,x19 - eor v2.16b,v2.16b,v26.16b + eor v10.16b,v10.16b,v2.16b eor x20,x20,x21 - eor v3.16b,v3.16b,v27.16b - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor v11.16b,v11.16b,v3.16b + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#7 // increment counter @@ -1890,82 +1946,83 @@ ChaCha20_512_neon: stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - eor v4.16b,v4.16b,v24.16b - eor v5.16b,v5.16b,v25.16b - eor v6.16b,v6.16b,v26.16b - eor v7.16b,v7.16b,v27.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - eor v8.16b,v8.16b,v0.16b - ldp q24,q25,[sp,#0] - eor v9.16b,v9.16b,v1.16b - ldp q26,q27,[sp,#32] - eor v10.16b,v10.16b,v2.16b - eor v11.16b,v11.16b,v3.16b st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 - eor v12.16b,v12.16b,v4.16b - eor v13.16b,v13.16b,v5.16b - eor v14.16b,v14.16b,v6.16b - eor v15.16b,v15.16b,v7.16b + eor v12.16b,v12.16b,v0.16b + eor v13.16b,v13.16b,v1.16b + eor v14.16b,v14.16b,v2.16b + eor v15.16b,v15.16b,v3.16b st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 eor v16.16b,v16.16b,v8.16b + ldp q0,q1,[sp,#0] eor v17.16b,v17.16b,v9.16b + ldp q2,q3,[sp,#32] eor v18.16b,v18.16b,v10.16b eor v19.16b,v19.16b,v11.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - shl v0.4s,v31.4s,#1 // 4 -> 8 + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 eor v20.16b,v20.16b,v12.16b eor v21.16b,v21.16b,v13.16b eor v22.16b,v22.16b,v14.16b eor v23.16b,v23.16b,v15.16b st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 - add v27.4s,v27.4s,v0.4s // += 8 - add v28.4s,v28.4s,v0.4s - add v29.4s,v29.4s,v0.4s - add v30.4s,v30.4s,v0.4s + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v24.16b,v24.16b,v16.16b + eor v25.16b,v25.16b,v17.16b + eor v26.16b,v26.16b,v18.16b + eor v27.16b,v27.16b,v19.16b + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + + shl v8.4s,v7.4s,#1 // 4 -> 8 + eor v28.16b,v28.16b,v20.16b + eor v29.16b,v29.16b,v21.16b + eor v30.16b,v30.16b,v22.16b + eor v31.16b,v31.16b,v23.16b + st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 + + add v3.4s,v3.4s,v8.4s // += 8 + add v4.4s,v4.4s,v8.4s + add v5.4s,v5.4s,v8.4s + add v6.4s,v6.4s,v8.4s b.hs .Loop_outer_512_neon adds x2,x2,#512 - ushr v0.4s,v31.4s,#2 // 4 -> 1 + ushr v7.4s,v7.4s,#1 // 4 -> 2 - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] + ldp d10,d11,[sp,#128+16] // meet ABI requirements ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] - stp q24,q31,[sp,#0] // wipe off-load area - stp q24,q31,[sp,#32] - stp q24,q31,[sp,#64] + stp q0,q0,[sp,#0] // wipe off-load area + stp q0,q0,[sp,#32] + stp q0,q0,[sp,#64] b.eq .Ldone_512_neon + sub x3,x3,#16 // .Lone cmp x2,#192 - sub v27.4s,v27.4s,v0.4s // -= 1 - sub v28.4s,v28.4s,v0.4s - sub v29.4s,v29.4s,v0.4s add sp,sp,#128 + sub v3.4s,v3.4s,v7.4s // -= 2 + ld1 {v8.4s,v9.4s},[x3] b.hs .Loop_outer_neon - eor v25.16b,v25.16b,v25.16b - eor v26.16b,v26.16b,v26.16b - eor v27.16b,v27.16b,v27.16b - eor v28.16b,v28.16b,v28.16b - eor v29.16b,v29.16b,v29.16b - eor v30.16b,v30.16b,v30.16b + ldp d8,d9,[sp,#0] // meet ABI requirements + eor v1.16b,v1.16b,v1.16b + eor v2.16b,v2.16b,v2.16b + eor v3.16b,v3.16b,v3.16b + 
eor v4.16b,v4.16b,v4.16b + eor v5.16b,v5.16b,v5.16b + eor v6.16b,v6.16b,v6.16b b .Loop_outer .Ldone_512_neon: + ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] diff --git a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S index fc74b7095cb84..d297ac15605cf 100644 --- a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S +++ b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S @@ -3823,7 +3823,7 @@ ecp_nistz256_ord_mul_mont: //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], -// int rep); +// uint64_t rep); .globl ecp_nistz256_ord_sqr_mont .type ecp_nistz256_ord_sqr_mont,%function .align 4 @@ -4023,7 +4023,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[x1] // X ldp x6,x7,[x1,#16] - str w4,[x0,#64*0-4] + stur w4,[x0,#64*0-4] lsr x4,x4,#32 str w5,[x0,#64*1-4] lsr x5,x5,#32 @@ -4039,7 +4039,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[x1,#32] // Y ldp x6,x7,[x1,#48] - str w4,[x0,#64*0-4] + stur w4,[x0,#64*0-4] lsr x4,x4,#32 str w5,[x0,#64*1-4] lsr x5,x5,#32 @@ -4055,7 +4055,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[x1,#64] // Z ldp x6,x7,[x1,#80] - str w4,[x0,#64*0-4] + stur w4,[x0,#64*0-4] lsr x4,x4,#32 str w5,[x0,#64*1-4] lsr x5,x5,#32 diff --git a/sys/crypto/openssl/aarch64/ghashv8-armx.S b/sys/crypto/openssl/aarch64/ghashv8-armx.S index 387d3c1f27b86..fe5cd49a25aa3 100644 --- a/sys/crypto/openssl/aarch64/ghashv8-armx.S +++ b/sys/crypto/openssl/aarch64/ghashv8-armx.S @@ -2,6 +2,7 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 +.arch armv8-a+crypto .text .globl gcm_init_v8 .type gcm_init_v8,%function diff --git a/sys/crypto/openssl/aarch64/keccak1600-armv8.S b/sys/crypto/openssl/aarch64/keccak1600-armv8.S index d52a88d8d0fd9..cf63318a8d087 100644 --- a/sys/crypto/openssl/aarch64/keccak1600-armv8.S +++ b/sys/crypto/openssl/aarch64/keccak1600-armv8.S @@ -574,22 +574,22 @@ SHA3_squeeze: .type KeccakF1600_ce,%function .align 5 KeccakF1600_ce: - mov x9,#12 + mov x9,#24 adr x10,iotas b .Loop_ce .align 4 .Loop_ce: ////////////////////////////////////////////////// Theta -.inst 0xce052819 //eor3 v25.16b,v0.16b,v5.16b,v10.16b -.inst 0xce062c3a //eor3 v26.16b,v1.16b,v6.16b,v11.16b -.inst 0xce07305b //eor3 v27.16b,v2.16b,v7.16b,v12.16b -.inst 0xce08347c //eor3 v28.16b,v3.16b,v8.16b,v13.16b -.inst 0xce09389d //eor3 v29.16b,v4.16b,v9.16b,v14.16b -.inst 0xce0f5339 //eor3 v25.16b,v25.16b, v15.16b,v20.16b -.inst 0xce10575a //eor3 v26.16b,v26.16b, v16.16b,v21.16b -.inst 0xce115b7b //eor3 v27.16b,v27.16b, v17.16b,v22.16b -.inst 0xce125f9c //eor3 v28.16b,v28.16b, v18.16b,v23.16b -.inst 0xce1363bd //eor3 v29.16b,v29.16b, v19.16b,v24.16b +.inst 0xce0f2a99 //eor3 v25.16b,v20.16b,v15.16b,v10.16b +.inst 0xce102eba //eor3 v26.16b,v21.16b,v16.16b,v11.16b +.inst 0xce1132db //eor3 v27.16b,v22.16b,v17.16b,v12.16b +.inst 0xce1236fc //eor3 v28.16b,v23.16b,v18.16b,v13.16b +.inst 0xce133b1d //eor3 v29.16b,v24.16b,v19.16b,v14.16b +.inst 0xce050339 //eor3 v25.16b,v25.16b, v5.16b,v0.16b +.inst 0xce06075a //eor3 v26.16b,v26.16b, v6.16b,v1.16b +.inst 0xce070b7b //eor3 v27.16b,v27.16b, v7.16b,v2.16b +.inst 0xce080f9c //eor3 v28.16b,v28.16b, v8.16b,v3.16b +.inst 0xce0913bd //eor3 v29.16b,v29.16b, v9.16b,v4.16b .inst 0xce7b8f3e //rax1 v30.16b,v25.16b,v27.16b // D[1] .inst 0xce7c8f5f //rax1 v31.16b,v26.16b,v28.16b // D[2] @@ -598,13 +598,15 @@ KeccakF1600_ce: .inst 0xce7a8fbd //rax1 v29.16b,v29.16b,v26.16b // D[0] 
////////////////////////////////////////////////// Theta+Rho+Pi -.inst 0xce9e50d9 //xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1] +.inst 0xce9efc39 //xar v25.16b, v1.16b,v30.16b,#64-1 // C[0]=A[2][0] + +.inst 0xce9e50c1 //xar v1.16b,v6.16b,v30.16b,#64-44 .inst 0xce9cb126 //xar v6.16b,v9.16b,v28.16b,#64-20 .inst 0xce9f0ec9 //xar v9.16b,v22.16b,v31.16b,#64-61 .inst 0xce9c65d6 //xar v22.16b,v14.16b,v28.16b,#64-39 .inst 0xce9dba8e //xar v14.16b,v20.16b,v29.16b,#64-18 -.inst 0xce9f0854 //xar v20.16b,v2.16b,v31.16b,#64-62 +.inst 0xce9f085a //xar v26.16b, v2.16b,v31.16b,#64-62 // C[1]=A[4][0] .inst 0xce9f5582 //xar v2.16b,v12.16b,v31.16b,#64-43 .inst 0xce9b9dac //xar v12.16b,v13.16b,v27.16b,#64-25 @@ -614,145 +616,57 @@ KeccakF1600_ce: .inst 0xce9c948f //xar v15.16b,v4.16b,v28.16b,#64-27 - eor v0.16b,v0.16b,v29.16b - ldr x11,[x10],#8 - -.inst 0xce9bae5a //xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3] -.inst 0xce9fc632 //xar v18.16b,v17.16b,v31.16b,#64-15 -.inst 0xce9ed971 //xar v17.16b,v11.16b,v30.16b,#64-10 -.inst 0xce9fe8eb //xar v11.16b,v7.16b,v31.16b,#64-6 -.inst 0xce9df547 //xar v7.16b,v10.16b,v29.16b,#64-3 - -.inst 0xce9efc2a //xar v10.16b,v1.16b,v30.16b,#64-1 // * - -.inst 0xce9ccb04 //xar v4.16b,v24.16b,v28.16b,#64-14 +.inst 0xce9ccb1c //xar v28.16b, v24.16b,v28.16b,#64-14 // D[4]=A[0][4] .inst 0xce9efab8 //xar v24.16b,v21.16b,v30.16b,#64-2 -.inst 0xce9b2515 //xar v21.16b,v8.16b,v27.16b,#64-55 -.inst 0xce9e4e08 //xar v8.16b,v16.16b,v30.16b,#64-45 +.inst 0xce9b2508 //xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1] +.inst 0xce9e4e04 //xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3] .inst 0xce9d70b0 //xar v16.16b,v5.16b,v29.16b,#64-36 -.inst 0xce9b907b //xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0] +.inst 0xce9b9065 //xar v5.16b,v3.16b,v27.16b,#64-28 - ////////////////////////////////////////////////// Chi+Iota - dup v31.2d,x11 // borrow C[6] -.inst 0xce22641c //bcax v28.16b, v0.16b,v2.16b,v25.16b // * -.inst 0xce3a0b21 //bcax v1.16b,v25.16b, v26.16b, v2.16b // * -.inst 0xce246842 //bcax v2.16b,v2.16b,v4.16b,v26.16b -.inst 0xce201343 //bcax v3.16b,v26.16b, v0.16b,v4.16b -.inst 0xce390084 //bcax v4.16b,v4.16b,v25.16b, v0.16b - -.inst 0xce271b65 //bcax v5.16b,v27.16b, v7.16b,v6.16b // * -.inst 0xce281cd9 //bcax v25.16b, v6.16b,v8.16b,v7.16b // * -.inst 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b -.inst 0xce3b2508 //bcax v8.16b,v8.16b,v27.16b, v9.16b -.inst 0xce266d29 //bcax v9.16b,v9.16b,v6.16b,v27.16b - - eor v0.16b,v28.16b,v31.16b // Iota - -.inst 0xce2c2d5a //bcax v26.16b, v10.16b,v12.16b,v11.16b // * -.inst 0xce2d317b //bcax v27.16b, v11.16b,v13.16b,v12.16b // * -.inst 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b -.inst 0xce2a39ad //bcax v13.16b,v13.16b,v10.16b,v14.16b -.inst 0xce2b29ce //bcax v14.16b,v14.16b,v11.16b,v10.16b + eor v0.16b,v0.16b,v29.16b -.inst 0xce3141fc //bcax v28.16b, v15.16b,v17.16b,v16.16b // * -.inst 0xce32461d //bcax v29.16b, v16.16b,v18.16b,v17.16b // * -.inst 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b -.inst 0xce2f4e52 //bcax v18.16b,v18.16b,v15.16b,v19.16b -.inst 0xce303e73 //bcax v19.16b,v19.16b,v16.16b,v15.16b +.inst 0xce9bae5b //xar v27.16b, v18.16b,v27.16b,#64-21 // D[3]=A[0][3] +.inst 0xce9fc623 //xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3] +.inst 0xce9ed97e //xar v30.16b, v11.16b,v30.16b,#64-10 // D[1]=A[3][2] +.inst 0xce9fe8ff //xar v31.16b, v7.16b,v31.16b,#64-6 // D[2]=A[2][1] +.inst 0xce9df55d //xar v29.16b, v10.16b,v29.16b,#64-3 // D[0]=A[1][2] -.inst 0xce36569e //bcax v30.16b, v20.16b,v22.16b,v21.16b // * 
-.inst 0xce375abf //bcax v31.16b, v21.16b,v23.16b,v22.16b // * + ////////////////////////////////////////////////// Chi+Iota +.inst 0xce362354 //bcax v20.16b,v26.16b, v22.16b,v8.16b // A[1][3]=A[4][1] +.inst 0xce375915 //bcax v21.16b,v8.16b,v23.16b,v22.16b // A[1][3]=A[4][1] .inst 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b -.inst 0xce3462f7 //bcax v23.16b,v23.16b,v20.16b,v24.16b -.inst 0xce355318 //bcax v24.16b,v24.16b,v21.16b,v20.16b - ////////////////////////////////////////////////// Theta -.inst 0xce056806 //eor3 v6.16b,v0.16b,v5.16b,v26.16b -.inst 0xce196c2a //eor3 v10.16b,v1.16b,v25.16b,v27.16b -.inst 0xce07304b //eor3 v11.16b,v2.16b,v7.16b,v12.16b -.inst 0xce08346f //eor3 v15.16b,v3.16b,v8.16b,v13.16b -.inst 0xce093890 //eor3 v16.16b,v4.16b,v9.16b,v14.16b -.inst 0xce1c78c6 //eor3 v6.16b,v6.16b, v28.16b,v30.16b -.inst 0xce1d7d4a //eor3 v10.16b,v10.16b, v29.16b,v31.16b -.inst 0xce11596b //eor3 v11.16b,v11.16b, v17.16b,v22.16b -.inst 0xce125def //eor3 v15.16b,v15.16b, v18.16b,v23.16b -.inst 0xce136210 //eor3 v16.16b,v16.16b, v19.16b,v24.16b - -.inst 0xce6b8cd4 //rax1 v20.16b,v6.16b,v11.16b // D[1] -.inst 0xce6f8d55 //rax1 v21.16b,v10.16b,v15.16b // D[2] -.inst 0xce708d6b //rax1 v11.16b,v11.16b,v16.16b // D[3] -.inst 0xce668def //rax1 v15.16b,v15.16b,v6.16b // D[4] -.inst 0xce6a8e10 //rax1 v16.16b,v16.16b,v10.16b // D[0] +.inst 0xce3a62f7 //bcax v23.16b,v23.16b,v26.16b, v24.16b +.inst 0xce286b18 //bcax v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1] - ////////////////////////////////////////////////// Theta+Rho+Pi -.inst 0xce945326 //xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1] -.inst 0xce8fb139 //xar v25.16b,v9.16b,v15.16b,#64-20 -.inst 0xce950ec9 //xar v9.16b,v22.16b,v21.16b,#64-61 -.inst 0xce8f65d6 //xar v22.16b,v14.16b,v15.16b,#64-39 -.inst 0xce90bbce //xar v14.16b,v30.16b,v16.16b,#64-18 - -.inst 0xce95085e //xar v30.16b,v2.16b,v21.16b,#64-62 - -.inst 0xce955582 //xar v2.16b,v12.16b,v21.16b,#64-43 -.inst 0xce8b9dac //xar v12.16b,v13.16b,v11.16b,#64-25 -.inst 0xce8fe26d //xar v13.16b,v19.16b,v15.16b,#64-8 -.inst 0xce8b22f3 //xar v19.16b,v23.16b,v11.16b,#64-56 -.inst 0xce905f97 //xar v23.16b,v28.16b,v16.16b,#64-41 - -.inst 0xce8f949c //xar v28.16b,v4.16b,v15.16b,#64-27 - - eor v0.16b,v0.16b,v16.16b - ldr x11,[x10],#8 - -.inst 0xce8bae4a //xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3] -.inst 0xce95c632 //xar v18.16b,v17.16b,v21.16b,#64-15 -.inst 0xce94db71 //xar v17.16b,v27.16b,v20.16b,#64-10 -.inst 0xce95e8fb //xar v27.16b,v7.16b,v21.16b,#64-6 -.inst 0xce90f747 //xar v7.16b,v26.16b,v16.16b,#64-3 + ld1r {v26.2d},[x10],#8 -.inst 0xce94fc3a //xar v26.16b,v1.16b,v20.16b,#64-1 // * +.inst 0xce330fd1 //bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3] +.inst 0xce2f4c72 //bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3] +.inst 0xce303e73 //bcax v19.16b,v19.16b,v16.16b,v15.16b +.inst 0xce3e41ef //bcax v15.16b,v15.16b,v30.16b, v16.16b +.inst 0xce237a10 //bcax v16.16b,v16.16b,v3.16b,v30.16b // A[0][3]=A[3][3] -.inst 0xce8fcb04 //xar v4.16b,v24.16b,v15.16b,#64-14 -.inst 0xce94fbf8 //xar v24.16b,v31.16b,v20.16b,#64-2 -.inst 0xce8b251f //xar v31.16b,v8.16b,v11.16b,#64-55 -.inst 0xce944fa8 //xar v8.16b,v29.16b,v20.16b,#64-45 -.inst 0xce9070bd //xar v29.16b,v5.16b,v16.16b,#64-36 +.inst 0xce2c7f2a //bcax v10.16b,v25.16b, v12.16b,v31.16b +.inst 0xce2d33eb //bcax v11.16b,v31.16b, v13.16b,v12.16b +.inst 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b +.inst 0xce3939ad //bcax v13.16b,v13.16b,v25.16b, v14.16b +.inst 0xce3f65ce //bcax v14.16b,v14.16b,v31.16b, v25.16b 
-.inst 0xce8b906b //xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0] +.inst 0xce2913a7 //bcax v7.16b,v29.16b, v9.16b,v4.16b // A[0][4]=A[1][3] +.inst 0xce252488 //bcax v8.16b,v4.16b,v5.16b,v9.16b // A[0][4]=A[1][3] +.inst 0xce261529 //bcax v9.16b,v9.16b,v6.16b,v5.16b +.inst 0xce3d18a5 //bcax v5.16b,v5.16b,v29.16b, v6.16b +.inst 0xce2474c6 //bcax v6.16b,v6.16b,v4.16b,v29.16b // A[0][4]=A[1][3] - ////////////////////////////////////////////////// Chi+Iota - dup v21.2d,x11 // borrow C[6] -.inst 0xce22180f //bcax v15.16b, v0.16b,v2.16b,v6.16b // * -.inst 0xce2a08c1 //bcax v1.16b,v6.16b, v10.16b, v2.16b // * -.inst 0xce242842 //bcax v2.16b,v2.16b,v4.16b,v10.16b -.inst 0xce201143 //bcax v3.16b,v10.16b, v0.16b,v4.16b -.inst 0xce260084 //bcax v4.16b,v4.16b,v6.16b, v0.16b - -.inst 0xce276565 //bcax v5.16b,v11.16b, v7.16b,v25.16b // * -.inst 0xce281f26 //bcax v6.16b, v25.16b,v8.16b,v7.16b // * -.inst 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b -.inst 0xce2b2508 //bcax v8.16b,v8.16b,v11.16b, v9.16b -.inst 0xce392d29 //bcax v9.16b,v9.16b,v25.16b,v11.16b - - eor v0.16b,v15.16b,v21.16b // Iota - -.inst 0xce2c6f4a //bcax v10.16b, v26.16b,v12.16b,v27.16b // * -.inst 0xce2d336b //bcax v11.16b, v27.16b,v13.16b,v12.16b // * -.inst 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b -.inst 0xce3a39ad //bcax v13.16b,v13.16b,v26.16b,v14.16b -.inst 0xce3b69ce //bcax v14.16b,v14.16b,v27.16b,v26.16b +.inst 0xce207363 //bcax v3.16b,v27.16b, v0.16b,v28.16b +.inst 0xce210384 //bcax v4.16b,v28.16b, v1.16b,v0.16b +.inst 0xce220400 //bcax v0.16b,v0.16b,v2.16b,v1.16b +.inst 0xce3b0821 //bcax v1.16b,v1.16b,v27.16b, v2.16b +.inst 0xce3c6c42 //bcax v2.16b,v2.16b,v28.16b, v27.16b -.inst 0xce31778f //bcax v15.16b, v28.16b,v17.16b,v29.16b // * -.inst 0xce3247b0 //bcax v16.16b, v29.16b,v18.16b,v17.16b // * -.inst 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b -.inst 0xce3c4e52 //bcax v18.16b,v18.16b,v28.16b,v19.16b -.inst 0xce3d7273 //bcax v19.16b,v19.16b,v29.16b,v28.16b + eor v0.16b,v0.16b,v26.16b -.inst 0xce367fd4 //bcax v20.16b, v30.16b,v22.16b,v31.16b // * -.inst 0xce375bf5 //bcax v21.16b, v31.16b,v23.16b,v22.16b // * -.inst 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b -.inst 0xce3e62f7 //bcax v23.16b,v23.16b,v30.16b,v24.16b -.inst 0xce3f7b18 //bcax v24.16b,v24.16b,v31.16b,v30.16b subs x9,x9,#1 bne .Loop_ce diff --git a/sys/crypto/openssl/aarch64/poly1305-armv8.S b/sys/crypto/openssl/aarch64/poly1305-armv8.S index 8ea86cb222fa6..101e4fb40552d 100644 --- a/sys/crypto/openssl/aarch64/poly1305-armv8.S +++ b/sys/crypto/openssl/aarch64/poly1305-armv8.S @@ -23,17 +23,12 @@ poly1305_init: csel x0,xzr,x0,eq b.eq .Lno_key -#ifdef __ILP32__ - ldrsw x11,.LOPENSSL_armcap_P -#else - ldr x11,.LOPENSSL_armcap_P -#endif - adr x10,.LOPENSSL_armcap_P + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] ldp x7,x8,[x1] // load key mov x9,#0xfffffffc0fffffff movk x9,#0x0fff,lsl#48 - ldr w17,[x10,x11] #ifdef __ARMEB__ rev x7,x7 // flip bytes rev x8,x8 @@ -45,10 +40,10 @@ poly1305_init: tst w17,#ARMV7_NEON - adr x12,poly1305_blocks - adr x7,poly1305_blocks_neon - adr x13,poly1305_emit - adr x8,poly1305_emit_neon + adr x12,.Lpoly1305_blocks + adr x7,.Lpoly1305_blocks_neon + adr x13,.Lpoly1305_emit + adr x8,.Lpoly1305_emit_neon csel x12,x12,x7,eq csel x13,x13,x8,eq @@ -67,6 +62,7 @@ poly1305_init: .type poly1305_blocks,%function .align 5 poly1305_blocks: +.Lpoly1305_blocks: ands x2,x2,#-16 b.eq .Lno_data @@ -131,6 +127,7 @@ poly1305_blocks: .type poly1305_emit,%function .align 5 poly1305_emit: +.Lpoly1305_emit: ldp 
x4,x5,[x0] // load hash base 2^64 ldr x6,[x0,#16] ldp x10,x11,[x2] // load nonce @@ -225,10 +222,11 @@ poly1305_splat: .type poly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: +.Lpoly1305_blocks_neon: ldr x17,[x0,#24] cmp x2,#128 b.hs .Lblocks_neon - cbz x17,poly1305_blocks + cbz x17,.Lpoly1305_blocks .Lblocks_neon: .inst 0xd503233f // paciasp @@ -371,7 +369,7 @@ poly1305_blocks_neon: csel x16,x17,x16,lo mov x4,#1 - str x4,[x0,#-24] // set is_base2_26 + stur x4,[x0,#-24] // set is_base2_26 sub x0,x0,#48 // restore original x0 b .Ldo_neon @@ -808,6 +806,7 @@ poly1305_blocks_neon: .type poly1305_emit_neon,%function .align 5 poly1305_emit_neon: +.Lpoly1305_emit_neon: ldr x17,[x0,#24] cbz x17,poly1305_emit @@ -860,12 +859,6 @@ poly1305_emit_neon: .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/sha1-armv8.S b/sys/crypto/openssl/aarch64/sha1-armv8.S index 847de3818ea66..a9e1d81d58177 100644 --- a/sys/crypto/openssl/aarch64/sha1-armv8.S +++ b/sys/crypto/openssl/aarch64/sha1-armv8.S @@ -1,22 +1,18 @@ /* Do not modify. This file is auto-generated from sha1-armv8.pl. */ -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" -.text +.hidden OPENSSL_armcap_P +#endif +.text -.hidden OPENSSL_armcap_P .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 sha1_block_data_order: -#ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -#else - ldr x16,.LOPENSSL_armcap_P -#endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry @@ -37,7 +33,7 @@ sha1_block_data_order: movz w28,#0x7999 sub x2,x2,#1 movk w28,#0x5a82,lsl#16 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x3,x3,#32 #else rev32 x3,x3 @@ -45,7 +41,7 @@ sha1_block_data_order: add w24,w24,w28 // warm it up add w24,w24,w3 lsr x4,x3,#32 - ldr x5,[x1,#-56] + ldur x5,[x1,#-56] bic w25,w23,w21 and w26,w22,w21 ror w27,w20,#27 @@ -55,7 +51,7 @@ sha1_block_data_order: ror w21,w21,#2 add w23,w23,w4 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x5,x5,#32 #else rev32 x5,x5 @@ -70,7 +66,7 @@ sha1_block_data_order: add w22,w22,w5 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) lsr x6,x5,#32 - ldr x7,[x1,#-48] + ldur x7,[x1,#-48] bic w25,w21,w24 and w26,w20,w24 ror w27,w23,#27 @@ -80,7 +76,7 @@ sha1_block_data_order: ror w24,w24,#2 add w21,w21,w6 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x7,x7,#32 #else rev32 x7,x7 @@ -95,7 +91,7 @@ sha1_block_data_order: add w20,w20,w7 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) lsr x8,x7,#32 - ldr x9,[x1,#-40] + ldur x9,[x1,#-40] bic w25,w24,w22 and w26,w23,w22 ror w27,w21,#27 @@ -105,7 +101,7 @@ sha1_block_data_order: ror w22,w22,#2 add w24,w24,w8 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x9,x9,#32 #else rev32 x9,x9 @@ -120,7 +116,7 @@ sha1_block_data_order: add w23,w23,w9 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) lsr x10,x9,#32 - ldr x11,[x1,#-32] + ldur x11,[x1,#-32] bic w25,w22,w20 and w26,w21,w20 ror w27,w24,#27 @@ -130,7 +126,7 @@ sha1_block_data_order: ror w20,w20,#2 add 
w22,w22,w10 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x11,x11,#32 #else rev32 x11,x11 @@ -145,7 +141,7 @@ sha1_block_data_order: add w21,w21,w11 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) lsr x12,x11,#32 - ldr x13,[x1,#-24] + ldur x13,[x1,#-24] bic w25,w20,w23 and w26,w24,w23 ror w27,w22,#27 @@ -155,7 +151,7 @@ sha1_block_data_order: ror w23,w23,#2 add w20,w20,w12 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x13,x13,#32 #else rev32 x13,x13 @@ -170,7 +166,7 @@ sha1_block_data_order: add w24,w24,w13 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) lsr x14,x13,#32 - ldr x15,[x1,#-16] + ldur x15,[x1,#-16] bic w25,w23,w21 and w26,w22,w21 ror w27,w20,#27 @@ -180,7 +176,7 @@ sha1_block_data_order: ror w21,w21,#2 add w23,w23,w14 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x15,x15,#32 #else rev32 x15,x15 @@ -195,7 +191,7 @@ sha1_block_data_order: add w22,w22,w15 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) lsr x16,x15,#32 - ldr x17,[x1,#-8] + ldur x17,[x1,#-8] bic w25,w21,w24 and w26,w20,w24 ror w27,w23,#27 @@ -205,7 +201,7 @@ sha1_block_data_order: ror w24,w24,#2 add w21,w21,w16 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror x17,x17,#32 #else rev32 x17,x17 @@ -1211,12 +1207,6 @@ sha1_block_armv8: .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/sha256-armv8.S b/sys/crypto/openssl/aarch64/sha256-armv8.S index e3be48716e07d..f0c4a14610b8c 100644 --- a/sys/crypto/openssl/aarch64/sha256-armv8.S +++ b/sys/crypto/openssl/aarch64/sha256-armv8.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from sha512-armv8.pl. */ // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // -// Licensed under the OpenSSL license (the "License"). You may not use +// Licensed under the Apache License 2.0 (the "License"). You may not use // this file except in compliance with the License. You can obtain a copy // in the file LICENSE in the source distribution or at // https://www.openssl.org/source/license.html @@ -28,6 +28,7 @@ // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. @@ -53,27 +54,23 @@ // deliver much less improvement, likely *negative* on Cortex-A5x. // Which is why NEON support is limited to SHA256.] 
+// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" + +.hidden OPENSSL_armcap_P #endif .text - -.hidden OPENSSL_armcap_P .globl sha256_block_data_order .type sha256_block_data_order,%function .align 6 sha256_block_data_order: #ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA256 b.ne .Lv8_entry tst w16,#ARMV7_NEON @@ -1064,15 +1061,6 @@ sha256_block_data_order: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK256,.-.LK256 -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. -# endif -#endif .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/sha512-armv8.S b/sys/crypto/openssl/aarch64/sha512-armv8.S index 2103672a35ff8..ded34d2a96b0d 100644 --- a/sys/crypto/openssl/aarch64/sha512-armv8.S +++ b/sys/crypto/openssl/aarch64/sha512-armv8.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from sha512-armv8.pl. */ // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. // -// Licensed under the OpenSSL license (the "License"). You may not use +// Licensed under the Apache License 2.0 (the "License"). You may not use // this file except in compliance with the License. You can obtain a copy // in the file LICENSE in the source distribution or at // https://www.openssl.org/source/license.html @@ -28,6 +28,7 @@ // X-Gene 20.0 (+100%) 12.8 (+300%(***)) // Mongoose 2.36 13.0 (+50%) 8.36 (+33%) // Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) // // (*) Software SHA256 results are of lesser relevance, presented // mostly for informational purposes. @@ -53,27 +54,23 @@ // deliver much less improvement, likely *negative* on Cortex-A5x. // Which is why NEON support is limited to SHA256.] +// $output is the last argument if it looks like a file (it has an extension) +// $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" + +.hidden OPENSSL_armcap_P #endif .text - -.hidden OPENSSL_armcap_P .globl sha512_block_data_order .type sha512_block_data_order,%function .align 6 sha512_block_data_order: #ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA512 b.ne .Lv8_entry #endif @@ -1086,15 +1083,6 @@ sha512_block_data_order: .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator .size .LK512,.-.LK512 -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ -.long OPENSSL_armcap_P-. -# else -.quad OPENSSL_armcap_P-. 
-# endif -#endif .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/sys/crypto/openssl/aarch64/vpaes-armv8.S b/sys/crypto/openssl/aarch64/vpaes-armv8.S index 7ec5b5a83538e..07c08bfe7daac 100644 --- a/sys/crypto/openssl/aarch64/vpaes-armv8.S +++ b/sys/crypto/openssl/aarch64/vpaes-armv8.S @@ -91,12 +91,12 @@ _vpaes_consts: .align 2 .size _vpaes_consts,.-_vpaes_consts .align 6 -## -## _aes_preheat -## -## Fills register %r10 -> .aes_consts (so you can -fPIC) -## and %xmm9-%xmm15 as specified below. -## +// +// _aes_preheat +// +// Fills register %r10 -> .aes_consts (so you can -fPIC) +// and %xmm9-%xmm15 as specified below. +// .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: @@ -108,21 +108,21 @@ _vpaes_encrypt_preheat: ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat -## -## _aes_encrypt_core -## -## AES-encrypt %xmm0. -## -## Inputs: -## %xmm0 = input -## %xmm9-%xmm15 as in _vpaes_preheat -## (%rdx) = scheduled keys -## -## Output in %xmm0 -## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax -## Preserves %xmm6 - %xmm8 so you get some local vectors -## -## +// +// _aes_encrypt_core +// +// AES-encrypt %xmm0. +// +// Inputs: +// %xmm0 = input +// %xmm9-%xmm15 as in _vpaes_preheat +// (%rdx) = scheduled keys +// +// Output in %xmm0 +// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +// Preserves %xmm6 - %xmm8 so you get some local vectors +// +// .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: @@ -328,11 +328,11 @@ _vpaes_decrypt_preheat: ret .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat -## -## Decryption core -## -## Same API as encryption core. -## +// +// Decryption core +// +// Same API as encryption core. +// .type _vpaes_decrypt_core,%function .align 4 _vpaes_decrypt_core: @@ -577,11 +577,11 @@ _vpaes_decrypt_2x: tbl v1.16b, {v8.16b},v2.16b ret .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x -######################################################## -## ## -## AES key schedule ## -## ## -######################################################## +//////////////////////////////////////////////////////// +// // +// AES key schedule // +// // +//////////////////////////////////////////////////////// .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: @@ -637,14 +637,14 @@ _vpaes_schedule_core: b.eq .Lschedule_192 // 128: fall though -## -## .schedule_128 -## -## 128-bit specific part of key schedule. -## -## This schedule is really simple, because all its parts -## are accomplished by the subroutines. -## +// +// .schedule_128 +// +// 128-bit specific part of key schedule. +// +// This schedule is really simple, because all its parts +// are accomplished by the subroutines. +// .Lschedule_128: mov x0, #10 // mov $10, %esi @@ -655,21 +655,21 @@ _vpaes_schedule_core: bl _vpaes_schedule_mangle // write output b .Loop_schedule_128 -## -## .aes_schedule_192 -## -## 192-bit specific part of key schedule. -## -## The main body of this schedule is the same as the 128-bit -## schedule, but with more smearing. The long, high side is -## stored in %xmm7 as before, and the short, low side is in -## the high bits of %xmm6. -## -## This schedule is somewhat nastier, however, because each -## round produces 192 bits of key material, or 1.5 round keys. -## Therefore, on each cycle we do 2 rounds and produce 3 round -## keys. 
-## +// +// .aes_schedule_192 +// +// 192-bit specific part of key schedule. +// +// The main body of this schedule is the same as the 128-bit +// schedule, but with more smearing. The long, high side is +// stored in %xmm7 as before, and the short, low side is in +// the high bits of %xmm6. +// +// This schedule is somewhat nastier, however, because each +// round produces 192 bits of key material, or 1.5 round keys. +// Therefore, on each cycle we do 2 rounds and produce 3 round +// keys. +// .align 4 .Lschedule_192: sub x0, x0, #8 @@ -693,16 +693,16 @@ _vpaes_schedule_core: bl _vpaes_schedule_192_smear b .Loop_schedule_192 -## -## .aes_schedule_256 -## -## 256-bit specific part of key schedule. -## -## The structure here is very similar to the 128-bit -## schedule, but with an additional "low side" in -## %xmm6. The low side's rounds are the same as the -## high side's, except no rcon and no rotation. -## +// +// .aes_schedule_256 +// +// 256-bit specific part of key schedule. +// +// The structure here is very similar to the 128-bit +// schedule, but with an additional "low side" in +// %xmm6. The low side's rounds are the same as the +// high side's, except no rcon and no rotation. +// .align 4 .Lschedule_256: ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) @@ -729,16 +729,16 @@ _vpaes_schedule_core: b .Loop_schedule_256 -## -## .aes_schedule_mangle_last -## -## Mangler for last round of key schedule -## Mangles %xmm0 -## when encrypting, outputs out(%xmm0) ^ 63 -## when decrypting, outputs unskew(%xmm0) -## -## Always called right before return... jumps to cleanup and exits -## +// +// .aes_schedule_mangle_last +// +// Mangler for last round of key schedule +// Mangles %xmm0 +// when encrypting, outputs out(%xmm0) ^ 63 +// when decrypting, outputs unskew(%xmm0) +// +// Always called right before return... jumps to cleanup and exits +// .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 @@ -772,20 +772,20 @@ _vpaes_schedule_core: ret .size _vpaes_schedule_core,.-_vpaes_schedule_core -## -## .aes_schedule_192_smear -## -## Smear the short, low side in the 192-bit key schedule. -## -## Inputs: -## %xmm7: high side, b a x y -## %xmm6: low side, d c 0 0 -## %xmm13: 0 -## -## Outputs: -## %xmm6: b+c+d b+c 0 0 -## %xmm0: b+c+d b+c b a -## +// +// .aes_schedule_192_smear +// +// Smear the short, low side in the 192-bit key schedule. +// +// Inputs: +// %xmm7: high side, b a x y +// %xmm6: low side, d c 0 0 +// %xmm13: 0 +// +// Outputs: +// %xmm6: b+c+d b+c 0 0 +// %xmm0: b+c+d b+c b a +// .type _vpaes_schedule_192_smear,%function .align 4 _vpaes_schedule_192_smear: @@ -801,24 +801,24 @@ _vpaes_schedule_192_smear: ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -## -## .aes_schedule_round -## -## Runs one main round of the key schedule on %xmm0, %xmm7 -## -## Specifically, runs subbytes on the high dword of %xmm0 -## then rotates it by one byte and xors into the low dword of -## %xmm7. -## -## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -## next rcon. -## -## Smears the dwords of %xmm7 by xoring the low into the -## second low, result into third, result into highest. -## -## Returns results in %xmm7 = %xmm0. -## Clobbers %xmm1-%xmm4, %r11. -## +// +// .aes_schedule_round +// +// Runs one main round of the key schedule on %xmm0, %xmm7 +// +// Specifically, runs subbytes on the high dword of %xmm0 +// then rotates it by one byte and xors into the low dword of +// %xmm7. 
+// +// Adds rcon from low byte of %xmm8, then rotates %xmm8 for +// next rcon. +// +// Smears the dwords of %xmm7 by xoring the low into the +// second low, result into third, result into highest. +// +// Returns results in %xmm7 = %xmm0. +// Clobbers %xmm1-%xmm4, %r11. +// .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: @@ -866,15 +866,15 @@ _vpaes_schedule_low_round: ret .size _vpaes_schedule_round,.-_vpaes_schedule_round -## -## .aes_schedule_transform -## -## Linear-transform %xmm0 according to tables at (%r11) -## -## Requires that %xmm9 = 0x0F0F... as in preheat -## Output in %xmm0 -## Clobbers %xmm1, %xmm2 -## +// +// .aes_schedule_transform +// +// Linear-transform %xmm0 according to tables at (%r11) +// +// Requires that %xmm9 = 0x0F0F... as in preheat +// Output in %xmm0 +// Clobbers %xmm1, %xmm2 +// .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: @@ -888,29 +888,29 @@ _vpaes_schedule_transform: ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform -## -## .aes_schedule_mangle -## -## Mangle xmm0 from (basis-transformed) standard version -## to our version. -## -## On encrypt, -## xor with 0x63 -## multiply by circulant 0,1,1,1 -## apply shiftrows transform -## -## On decrypt, -## xor with 0x63 -## multiply by "inverse mixcolumns" circulant E,B,D,9 -## deskew -## apply shiftrows transform -## -## -## Writes out to (%rdx), and increments or decrements it -## Keeps track of round number mod 4 in %r8 -## Preserves xmm0 -## Clobbers xmm1-xmm5 -## +// +// .aes_schedule_mangle +// +// Mangle xmm0 from (basis-transformed) standard version +// to our version. +// +// On encrypt, +// xor with 0x63 +// multiply by circulant 0,1,1,1 +// apply shiftrows transform +// +// On decrypt, +// xor with 0x63 +// multiply by "inverse mixcolumns" circulant E,B,D,9 +// deskew +// apply shiftrows transform +// +// +// Writes out to (%rdx), and increments or decrements it +// Keeps track of round number mod 4 in %r8 +// Preserves xmm0 +// Clobbers xmm1-xmm5 +// .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: diff --git a/sys/crypto/openssl/arm/aes-armv4.S b/sys/crypto/openssl/arm/aes-armv4.S index c0c01485aaf92..723b54f4374aa 100644 --- a/sys/crypto/openssl/arm/aes-armv4.S +++ b/sys/crypto/openssl/arm/aes-armv4.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from aes-armv4.pl. */ @ Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. @ -@ Licensed under the OpenSSL license (the "License"). You may not use +@ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. You can obtain a copy @ in the file LICENSE in the source distribution or at @ https://www.openssl.org/source/license.html @@ -39,13 +39,14 @@ @ Profiler-assisted and platform-specific optimization resulted in 16% @ improvement on Cortex A8 core and ~21.5 cycles per byte. 
+@ $output is the last argument if it looks like a file (it has an extension) +@ $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ #endif -.text #if defined(__thumb2__) && !defined(__APPLE__) .syntax unified .thumb @@ -54,6 +55,8 @@ #undef __thumb2__ #endif +.text + .type AES_Te,%object .align 5 AES_Te: diff --git a/sys/crypto/openssl/arm/aesv8-armx.S b/sys/crypto/openssl/arm/aesv8-armx.S index 569f1c6ba4001..b59badae62afb 100644 --- a/sys/crypto/openssl/arm/aesv8-armx.S +++ b/sys/crypto/openssl/arm/aesv8-armx.S @@ -2,11 +2,18 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 -.text .arch armv7-a @ don't confuse not-so-latest binutils with argv8 :-) .fpu neon +#ifdef __thumb2__ +.syntax unified +.thumb +# define INST(a,b,c,d) .byte c,d|0xc,a,b +#else .code 32 -#undef __thumb2__ +# define INST(a,b,c,d) .byte a,b,c,d +#endif + +.text .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 @@ -49,7 +56,7 @@ aes_v8_set_encrypt_key: vtbl.8 d21,{q3},d5 vext.8 q9,q0,q3,#12 vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 subs r1,r1,#1 veor q3,q3,q9 @@ -68,7 +75,7 @@ aes_v8_set_encrypt_key: vtbl.8 d21,{q3},d5 vext.8 q9,q0,q3,#12 vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 veor q3,q3,q9 vext.8 q9,q0,q9,#12 @@ -83,7 +90,7 @@ aes_v8_set_encrypt_key: vtbl.8 d21,{q3},d5 vext.8 q9,q0,q3,#12 vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 veor q3,q3,q9 vext.8 q9,q0,q9,#12 @@ -115,7 +122,7 @@ aes_v8_set_encrypt_key: #else vst1.32 {d16},[r2]! #endif -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 subs r1,r1,#1 veor q3,q3,q9 @@ -151,7 +158,7 @@ aes_v8_set_encrypt_key: vtbl.8 d21,{q8},d5 vext.8 q9,q0,q3,#12 vst1.32 {q8},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 subs r1,r1,#1 veor q3,q3,q9 @@ -167,7 +174,7 @@ aes_v8_set_encrypt_key: vdup.32 q10,d7[1] vext.8 q9,q0,q8,#12 -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + INST(0x00,0x43,0xf0,0xf3) @ aese q10,q0 veor q8,q8,q9 vext.8 q9,q0,q9,#12 @@ -210,15 +217,15 @@ aes_v8_set_decrypt_key: .Loop_imc: vld1.32 {q0},[r2] vld1.32 {q1},[r0] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 vst1.32 {q0},[r0],r4 vst1.32 {q1},[r2]! cmp r0,r2 bhi .Loop_imc vld1.32 {q0},[r2] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 vst1.32 {q0},[r0] eor r0,r0,r0 @ return value @@ -236,19 +243,19 @@ aes_v8_encrypt: vld1.32 {q1},[r2]! .Loop_enc: -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + INST(0x00,0x43,0xb0,0xf3) @ aese q2,q0 + INST(0x84,0x43,0xb0,0xf3) @ aesmc q2,q2 vld1.32 {q0},[r2]! subs r3,r3,#2 -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + INST(0x02,0x43,0xb0,0xf3) @ aese q2,q1 + INST(0x84,0x43,0xb0,0xf3) @ aesmc q2,q2 vld1.32 {q1},[r2]! bgt .Loop_enc -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + INST(0x00,0x43,0xb0,0xf3) @ aese q2,q0 + INST(0x84,0x43,0xb0,0xf3) @ aesmc q2,q2 vld1.32 {q0},[r2] -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + INST(0x02,0x43,0xb0,0xf3) @ aese q2,q1 veor q2,q2,q0 vst1.8 {q2},[r1] @@ -265,24 +272,336 @@ aes_v8_decrypt: vld1.32 {q1},[r2]! 
.Loop_dec: -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + INST(0x40,0x43,0xb0,0xf3) @ aesd q2,q0 + INST(0xc4,0x43,0xb0,0xf3) @ aesimc q2,q2 vld1.32 {q0},[r2]! subs r3,r3,#2 -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + INST(0x42,0x43,0xb0,0xf3) @ aesd q2,q1 + INST(0xc4,0x43,0xb0,0xf3) @ aesimc q2,q2 vld1.32 {q1},[r2]! bgt .Loop_dec -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + INST(0x40,0x43,0xb0,0xf3) @ aesd q2,q0 + INST(0xc4,0x43,0xb0,0xf3) @ aesimc q2,q2 vld1.32 {q0},[r2] -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + INST(0x42,0x43,0xb0,0xf3) @ aesd q2,q1 veor q2,q2,q0 vst1.8 {q2},[r1] bx lr .size aes_v8_decrypt,.-aes_v8_decrypt +.globl aes_v8_ecb_encrypt +.type aes_v8_ecb_encrypt,%function +.align 5 +aes_v8_ecb_encrypt: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,lr} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load remaining args + subs r2,r2,#16 + mov r8,#16 + blo .Lecb_done + it eq + moveq r8,#0 + + cmp r4,#0 @ en- or decrypting? + ldr r5,[r3,#240] + and r2,r2,#-16 + vld1.8 {q0},[r0],r8 + + vld1.32 {q8,q9},[r3] @ load key schedule... + sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10,q11},[r7]! + vld1.32 {q12,q13},[r7]! + vld1.32 {q14,q15},[r7]! + vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + beq .Lecb_dec + + vld1.8 {q1},[r0]! + subs r2,r2,#32 @ bias + add r6,r5,#2 + vorr q3,q1,q1 + vorr q10,q1,q1 + vorr q1,q0,q0 + blo .Lecb_enc_tail + + vorr q1,q3,q3 + vld1.8 {q10},[r0]! +.Loop3x_ecb_enc: + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.32 {q9},[r7]! + bgt .Loop3x_ecb_enc + + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + subs r2,r2,#0x30 + it lo + movlo r6,r2 @ r6, r6, is zero at this point + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + add r0,r0,r6 @ r0 is adjusted in such way that + @ at exit from the loop q1-q10 + @ are loaded with last "words" + mov r7,r3 + INST(0x28,0x03,0xb0,0xf3) @ aese q0,q12 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x28,0x23,0xb0,0xf3) @ aese q1,q12 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x28,0x43,0xf0,0xf3) @ aese q10,q12 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.8 {q2},[r0]! + INST(0x2a,0x03,0xb0,0xf3) @ aese q0,q13 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2a,0x23,0xb0,0xf3) @ aese q1,q13 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x2a,0x43,0xf0,0xf3) @ aese q10,q13 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.8 {q3},[r0]! 
+ INST(0x2c,0x03,0xb0,0xf3) @ aese q0,q14 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2c,0x23,0xb0,0xf3) @ aese q1,q14 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x2c,0x43,0xf0,0xf3) @ aese q10,q14 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.8 {q11},[r0]! + INST(0x2e,0x03,0xb0,0xf3) @ aese q0,q15 + INST(0x2e,0x23,0xb0,0xf3) @ aese q1,q15 + INST(0x2e,0x43,0xf0,0xf3) @ aese q10,q15 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + add r6,r5,#2 + veor q4,q7,q0 + veor q5,q7,q1 + veor q10,q10,q7 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q4},[r1]! + vorr q0,q2,q2 + vst1.8 {q5},[r1]! + vorr q1,q3,q3 + vst1.8 {q10},[r1]! + vorr q10,q11,q11 + bhs .Loop3x_ecb_enc + + cmn r2,#0x30 + beq .Lecb_done + nop + +.Lecb_enc_tail: + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + vld1.32 {q9},[r7]! + bgt .Lecb_enc_tail + + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + INST(0x28,0x23,0xb0,0xf3) @ aese q1,q12 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x28,0x43,0xf0,0xf3) @ aese q10,q12 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + cmn r2,#0x20 + INST(0x2a,0x23,0xb0,0xf3) @ aese q1,q13 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x2a,0x43,0xf0,0xf3) @ aese q10,q13 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + INST(0x2c,0x23,0xb0,0xf3) @ aese q1,q14 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x2c,0x43,0xf0,0xf3) @ aese q10,q14 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 + INST(0x2e,0x23,0xb0,0xf3) @ aese q1,q15 + INST(0x2e,0x43,0xf0,0xf3) @ aese q10,q15 + beq .Lecb_enc_one + veor q5,q7,q1 + veor q9,q7,q10 + vst1.8 {q5},[r1]! + vst1.8 {q9},[r1]! + b .Lecb_done + +.Lecb_enc_one: + veor q5,q7,q10 + vst1.8 {q5},[r1]! + b .Lecb_done +.align 5 +.Lecb_dec: + vld1.8 {q1},[r0]! + subs r2,r2,#32 @ bias + add r6,r5,#2 + vorr q3,q1,q1 + vorr q10,q1,q1 + vorr q1,q0,q0 + blo .Lecb_dec_tail + + vorr q1,q3,q3 + vld1.8 {q10},[r0]! +.Loop3x_ecb_dec: + INST(0x60,0x03,0xb0,0xf3) @ aesd q0,q8 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 + INST(0x62,0x03,0xb0,0xf3) @ aesd q0,q9 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.32 {q9},[r7]! 
+ bgt .Loop3x_ecb_dec + + INST(0x60,0x03,0xb0,0xf3) @ aesd q0,q8 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + subs r2,r2,#0x30 + it lo + movlo r6,r2 @ r6, r6, is zero at this point + INST(0x62,0x03,0xb0,0xf3) @ aesd q0,q9 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + add r0,r0,r6 @ r0 is adjusted in such way that + @ at exit from the loop q1-q10 + @ are loaded with last "words" + mov r7,r3 + INST(0x68,0x03,0xb0,0xf3) @ aesd q0,q12 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x68,0x23,0xb0,0xf3) @ aesd q1,q12 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x68,0x43,0xf0,0xf3) @ aesd q10,q12 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.8 {q2},[r0]! + INST(0x6a,0x03,0xb0,0xf3) @ aesd q0,q13 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x6a,0x23,0xb0,0xf3) @ aesd q1,q13 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6a,0x43,0xf0,0xf3) @ aesd q10,q13 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.8 {q3},[r0]! + INST(0x6c,0x03,0xb0,0xf3) @ aesd q0,q14 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x6c,0x23,0xb0,0xf3) @ aesd q1,q14 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6c,0x43,0xf0,0xf3) @ aesd q10,q14 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.8 {q11},[r0]! + INST(0x6e,0x03,0xb0,0xf3) @ aesd q0,q15 + INST(0x6e,0x23,0xb0,0xf3) @ aesd q1,q15 + INST(0x6e,0x43,0xf0,0xf3) @ aesd q10,q15 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + add r6,r5,#2 + veor q4,q7,q0 + veor q5,q7,q1 + veor q10,q10,q7 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q4},[r1]! + vorr q0,q2,q2 + vst1.8 {q5},[r1]! + vorr q1,q3,q3 + vst1.8 {q10},[r1]! + vorr q10,q11,q11 + bhs .Loop3x_ecb_dec + + cmn r2,#0x30 + beq .Lecb_done + nop + +.Lecb_dec_tail: + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + vld1.32 {q9},[r7]! + bgt .Lecb_dec_tail + + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x68,0x23,0xb0,0xf3) @ aesd q1,q12 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x68,0x43,0xf0,0xf3) @ aesd q10,q12 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + cmn r2,#0x20 + INST(0x6a,0x23,0xb0,0xf3) @ aesd q1,q13 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6a,0x43,0xf0,0xf3) @ aesd q10,q13 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x6c,0x23,0xb0,0xf3) @ aesd q1,q14 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6c,0x43,0xf0,0xf3) @ aesd q10,q14 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x6e,0x23,0xb0,0xf3) @ aesd q1,q15 + INST(0x6e,0x43,0xf0,0xf3) @ aesd q10,q15 + beq .Lecb_dec_one + veor q5,q7,q1 + veor q9,q7,q10 + vst1.8 {q5},[r1]! + vst1.8 {q9},[r1]! 
+ b .Lecb_done + +.Lecb_dec_one: + veor q5,q7,q10 + vst1.8 {q5},[r1]! + +.Lecb_done: + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,pc} +.size aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt .globl aes_v8_cbc_encrypt .type aes_v8_cbc_encrypt,%function .align 5 @@ -294,6 +613,7 @@ aes_v8_cbc_encrypt: subs r2,r2,#16 mov r8,#16 blo .Lcbc_abort + it eq moveq r8,#0 cmp r5,#0 @ en- or decrypting? @@ -324,58 +644,59 @@ aes_v8_cbc_encrypt: add r7,r3,#16 add r6,r3,#16*4 add r12,r3,#16*5 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 add r14,r3,#16*6 add r3,r3,#16*7 b .Lenter_cbc_enc .align 4 .Loop_cbc_enc: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vst1.8 {q6},[r1]! .Lenter_cbc_enc: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x04,0x03,0xb0,0xf3) @ aese q0,q2 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q8},[r6] cmp r5,#4 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x06,0x03,0xb0,0xf3) @ aese q0,q3 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q9},[r12] beq .Lcbc_enc192 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q8},[r14] -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q9},[r3] nop .Lcbc_enc192: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 subs r2,r2,#16 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + it eq moveq r8,#0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x24,0x03,0xb0,0xf3) @ aese q0,q10 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x26,0x03,0xb0,0xf3) @ aese q0,q11 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x28,0x03,0xb0,0xf3) @ aese q0,q12 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 veor q8,q8,q5 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x2a,0x03,0xb0,0xf3) @ aese q0,q13 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.32 {q9},[r7] @ re-pre-load rndkey[1] -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + INST(0x2c,0x03,0xb0,0xf3) @ aese q0,q14 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2e,0x03,0xb0,0xf3) @ aese q0,q15 veor q6,q0,q7 bhs .Loop_cbc_enc @@ -385,35 +706,36 @@ aes_v8_cbc_encrypt: .align 5 .Lcbc_enc128: vld1.32 {q2,q3},[r7] -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 b .Lenter_cbc_enc128 .Loop_cbc_enc128: 
-.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vst1.8 {q6},[r1]! .Lenter_cbc_enc128: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 subs r2,r2,#16 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x04,0x03,0xb0,0xf3) @ aese q0,q2 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + it eq moveq r8,#0 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x06,0x03,0xb0,0xf3) @ aese q0,q3 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x24,0x03,0xb0,0xf3) @ aese q0,q10 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x26,0x03,0xb0,0xf3) @ aese q0,q11 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + INST(0x28,0x03,0xb0,0xf3) @ aese q0,q12 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2a,0x03,0xb0,0xf3) @ aese q0,q13 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2c,0x03,0xb0,0xf3) @ aese q0,q14 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 veor q8,q8,q5 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + INST(0x2e,0x03,0xb0,0xf3) @ aese q0,q15 veor q6,q0,q7 bhs .Loop_cbc_enc128 @@ -434,71 +756,71 @@ aes_v8_cbc_encrypt: vorr q2,q0,q0 vorr q3,q1,q1 vorr q11,q10,q10 - .Loop3x_cbc_dec: -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x60,0x03,0xb0,0xf3) @ aesd q0,q8 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.32 {q8},[r7]! subs r6,r6,#2 -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x62,0x03,0xb0,0xf3) @ aesd q0,q9 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.32 {q9},[r7]! 
bgt .Loop3x_cbc_dec -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x60,0x03,0xb0,0xf3) @ aesd q0,q8 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 veor q4,q6,q7 subs r2,r2,#0x30 veor q5,q2,q7 + it lo movlo r6,r2 @ r6, r6, is zero at this point -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x62,0x03,0xb0,0xf3) @ aesd q0,q9 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 veor q9,q3,q7 add r0,r0,r6 @ r0 is adjusted in such way that @ at exit from the loop q1-q10 @ are loaded with last "words" vorr q6,q11,q11 mov r7,r3 -.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x68,0x03,0xb0,0xf3) @ aesd q0,q12 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x68,0x23,0xb0,0xf3) @ aesd q1,q12 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x68,0x43,0xf0,0xf3) @ aesd q10,q12 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.8 {q2},[r0]! -.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x6a,0x03,0xb0,0xf3) @ aesd q0,q13 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x6a,0x23,0xb0,0xf3) @ aesd q1,q13 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6a,0x43,0xf0,0xf3) @ aesd q10,q13 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.8 {q3},[r0]! -.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x6c,0x03,0xb0,0xf3) @ aesd q0,q14 + INST(0xc0,0x03,0xb0,0xf3) @ aesimc q0,q0 + INST(0x6c,0x23,0xb0,0xf3) @ aesd q1,q14 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6c,0x43,0xf0,0xf3) @ aesd q10,q14 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.8 {q11},[r0]! -.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + INST(0x6e,0x03,0xb0,0xf3) @ aesd q0,q15 + INST(0x6e,0x23,0xb0,0xf3) @ aesd q1,q15 + INST(0x6e,0x43,0xf0,0xf3) @ aesd q10,q15 vld1.32 {q8},[r7]! 
@ re-pre-load rndkey[0] add r6,r5,#2 veor q4,q4,q0 @@ -518,44 +840,44 @@ aes_v8_cbc_encrypt: nop .Lcbc_dec_tail: -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.32 {q8},[r7]! subs r6,r6,#2 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 vld1.32 {q9},[r7]! bgt .Lcbc_dec_tail -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x60,0x23,0xb0,0xf3) @ aesd q1,q8 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x60,0x43,0xf0,0xf3) @ aesd q10,q8 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x62,0x23,0xb0,0xf3) @ aesd q1,q9 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x62,0x43,0xf0,0xf3) @ aesd q10,q9 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 + INST(0x68,0x23,0xb0,0xf3) @ aesd q1,q12 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x68,0x43,0xf0,0xf3) @ aesd q10,q12 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 cmn r2,#0x20 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x6a,0x23,0xb0,0xf3) @ aesd q1,q13 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6a,0x43,0xf0,0xf3) @ aesd q10,q13 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 veor q5,q6,q7 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + INST(0x6c,0x23,0xb0,0xf3) @ aesd q1,q14 + INST(0xc2,0x23,0xb0,0xf3) @ aesimc q1,q1 + INST(0x6c,0x43,0xf0,0xf3) @ aesd q10,q14 + INST(0xe4,0x43,0xf0,0xf3) @ aesimc q10,q10 veor q9,q3,q7 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + INST(0x6e,0x23,0xb0,0xf3) @ aesd q1,q15 + INST(0x6e,0x43,0xf0,0xf3) @ aesd q10,q15 beq .Lcbc_dec_one veor q5,q5,q1 veor q9,q9,q10 @@ -602,6 +924,7 @@ aes_v8_ctr32_encrypt_blocks: vld1.32 {q7},[r7] add r7,r3,#32 mov r6,r5 + it lo movlo r12,#0 #ifndef __ARMEB__ rev r8, r8 @@ -621,76 +944,76 @@ aes_v8_ctr32_encrypt_blocks: .align 4 .Loop3x_ctr32: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 vld1.32 {q8},[r7]! 
subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 vld1.32 {q9},[r7]! bgt .Loop3x_ctr32 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x83,0xb0,0xf3) @ aesmc q4,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0xa3,0xb0,0xf3) @ aesmc q5,q1 vld1.8 {q2},[r0]! add r9,r8,#1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + INST(0x20,0x43,0xf0,0xf3) @ aese q10,q8 + INST(0xa4,0x43,0xf0,0xf3) @ aesmc q10,q10 vld1.8 {q3},[r0]! rev r9,r9 -.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + INST(0x22,0x83,0xb0,0xf3) @ aese q4,q9 + INST(0x88,0x83,0xb0,0xf3) @ aesmc q4,q4 + INST(0x22,0xa3,0xb0,0xf3) @ aese q5,q9 + INST(0x8a,0xa3,0xb0,0xf3) @ aesmc q5,q5 vld1.8 {q11},[r0]! mov r7,r3 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 -.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + INST(0x22,0x43,0xf0,0xf3) @ aese q10,q9 + INST(0xa4,0x23,0xf0,0xf3) @ aesmc q9,q10 + INST(0x28,0x83,0xb0,0xf3) @ aese q4,q12 + INST(0x88,0x83,0xb0,0xf3) @ aesmc q4,q4 + INST(0x28,0xa3,0xb0,0xf3) @ aese q5,q12 + INST(0x8a,0xa3,0xb0,0xf3) @ aesmc q5,q5 veor q2,q2,q7 add r10,r8,#2 -.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + INST(0x28,0x23,0xf0,0xf3) @ aese q9,q12 + INST(0xa2,0x23,0xf0,0xf3) @ aesmc q9,q9 veor q3,q3,q7 add r8,r8,#3 -.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + INST(0x2a,0x83,0xb0,0xf3) @ aese q4,q13 + INST(0x88,0x83,0xb0,0xf3) @ aesmc q4,q4 + INST(0x2a,0xa3,0xb0,0xf3) @ aese q5,q13 + INST(0x8a,0xa3,0xb0,0xf3) @ aesmc q5,q5 veor q11,q11,q7 vmov.32 d13[1], r9 -.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + INST(0x2a,0x23,0xf0,0xf3) @ aese q9,q13 + INST(0xa2,0x23,0xf0,0xf3) @ aesmc q9,q9 vorr q0,q6,q6 rev r10,r10 -.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + INST(0x2c,0x83,0xb0,0xf3) @ aese q4,q14 + INST(0x88,0x83,0xb0,0xf3) @ aesmc q4,q4 vmov.32 d13[1], r10 rev r12,r8 -.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + INST(0x2c,0xa3,0xb0,0xf3) @ aese q5,q14 + INST(0x8a,0xa3,0xb0,0xf3) @ aesmc q5,q5 vorr q1,q6,q6 vmov.32 d13[1], r12 -.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + INST(0x2c,0x23,0xf0,0xf3) @ aese q9,q14 + INST(0xa2,0x23,0xf0,0xf3) @ aesmc q9,q9 vorr q10,q6,q6 subs r2,r2,#3 -.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 -.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 -.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 + INST(0x2e,0x83,0xb0,0xf3) @ aese q4,q15 + INST(0x2e,0xa3,0xb0,0xf3) @ aese q5,q15 + 
INST(0x2e,0x23,0xf0,0xf3) @ aese q9,q15 veor q2,q2,q4 vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] @@ -707,48 +1030,49 @@ aes_v8_ctr32_encrypt_blocks: beq .Lctr32_done cmp r2,#1 mov r12,#16 + it eq moveq r12,#0 .Lctr32_tail: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 vld1.32 {q8},[r7]! subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 vld1.32 {q9},[r7]! bgt .Lctr32_tail -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x20,0x03,0xb0,0xf3) @ aese q0,q8 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x20,0x23,0xb0,0xf3) @ aese q1,q8 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 + INST(0x22,0x03,0xb0,0xf3) @ aese q0,q9 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x22,0x23,0xb0,0xf3) @ aese q1,q9 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 vld1.8 {q2},[r0],r12 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x28,0x03,0xb0,0xf3) @ aese q0,q12 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x28,0x23,0xb0,0xf3) @ aese q1,q12 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 vld1.8 {q3},[r0] -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x2a,0x03,0xb0,0xf3) @ aese q0,q13 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2a,0x23,0xb0,0xf3) @ aese q1,q13 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 veor q2,q2,q7 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + INST(0x2c,0x03,0xb0,0xf3) @ aese q0,q14 + INST(0x80,0x03,0xb0,0xf3) @ aesmc q0,q0 + INST(0x2c,0x23,0xb0,0xf3) @ aese q1,q14 + INST(0x82,0x23,0xb0,0xf3) @ aesmc q1,q1 veor q3,q3,q7 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 -.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 + INST(0x2e,0x03,0xb0,0xf3) @ aese q0,q15 + INST(0x2e,0x23,0xb0,0xf3) @ aese q1,q15 cmp r2,#1 veor q2,q2,q0 diff --git a/sys/crypto/openssl/arm/armv4-gf2m.S b/sys/crypto/openssl/arm/armv4-gf2m.S index 5ee07d4bf9a35..ccf6212f6eb10 100644 --- a/sys/crypto/openssl/arm/armv4-gf2m.S +++ b/sys/crypto/openssl/arm/armv4-gf2m.S @@ -1,13 +1,14 @@ /* Do not modify. This file is auto-generated from armv4-gf2m.pl. 
*/ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif + +.text .type mul_1x1_ialu,%function .align 5 mul_1x1_ialu: @@ -100,11 +101,13 @@ bn_GF2m_mul_2x2: #if __ARM_MAX_ARCH__>=7 stmdb sp!,{r10,lr} ldr r12,.LOPENSSL_armcap +# if !defined(_WIN32) adr r10,.LOPENSSL_armcap ldr r12,[r12,r10] -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] -#endif +# endif tst r12,#ARMV7_NEON itt ne ldrne r10,[sp],#8 @@ -218,7 +221,11 @@ bn_GF2m_mul_2x2: #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-. +# endif #endif .byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 diff --git a/sys/crypto/openssl/arm/armv4-mont.S b/sys/crypto/openssl/arm/armv4-mont.S index 69ca04f891192..460d5cb400cb8 100644 --- a/sys/crypto/openssl/arm/armv4-mont.S +++ b/sys/crypto/openssl/arm/armv4-mont.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from armv4-mont.pl. */ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb @@ -9,10 +8,16 @@ .code 32 #endif +.text + #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lbn_mul_mont +# endif #endif .globl bn_mul_mont @@ -26,12 +31,14 @@ bn_mul_mont: #if __ARM_MAX_ARCH__>=7 tst ip,#7 bne .Lialu - adr r0,.Lbn_mul_mont - ldr r2,.LOPENSSL_armcap + ldr r0,.LOPENSSL_armcap +#if !defined(_WIN32) + adr r2,.Lbn_mul_mont ldr r0,[r0,r2] -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r0,[r0] -#endif +# endif tst r0,#ARMV7_NEON @ NEON available? ldmia sp, {r0,r2} beq .Lialu diff --git a/sys/crypto/openssl/arm/armv4cpuid.S b/sys/crypto/openssl/arm/armv4cpuid.S index f1f59a2de10ec..8436d1f079c4f 100644 --- a/sys/crypto/openssl/arm/armv4cpuid.S +++ b/sys/crypto/openssl/arm/armv4cpuid.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from armv4cpuid.pl. */ #include "arm_arch.h" -.text #if defined(__thumb2__) && !defined(__APPLE__) .syntax unified .thumb @@ -10,6 +9,8 @@ #undef __thumb2__ #endif +.text + .align 5 .globl OPENSSL_atomic_add .type OPENSSL_atomic_add,%function diff --git a/sys/crypto/openssl/arm/bsaes-armv7.S b/sys/crypto/openssl/arm/bsaes-armv7.S index 2414da194a228..91261fc0b69d2 100644 --- a/sys/crypto/openssl/arm/bsaes-armv7.S +++ b/sys/crypto/openssl/arm/bsaes-armv7.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from bsaes-armv7.pl. */ -@ Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. +@ Copyright 2012-2023 The OpenSSL Project Authors. All Rights Reserved. @ -@ Licensed under the OpenSSL license (the "License"). You may not use +@ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. You can obtain a copy @ in the file LICENSE in the source distribution or at @ https://www.openssl.org/source/license.html @@ -14,7 +14,7 @@ @ details see http://www.openssl.org/~appro/cryptogams/. @ @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ of Linaro. Permission to use under GPL terms is granted. +@ of Linaro. 
@ ==================================================================== @ Bit-sliced AES for ARM NEON @@ -50,6 +50,8 @@ @ April-August 2013 @ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. +@ $output is the last argument if it looks like a file (it has an extension) +@ $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" @@ -74,7 +76,6 @@ .arch armv7-a .fpu neon -.text .syntax unified @ ARMv7-capable assembler is expected to handle this #if defined(__thumb2__) && !defined(__APPLE__) .thumb @@ -83,6 +84,8 @@ # undef __thumb2__ #endif +.text + .type _bsaes_decrypt8,%function .align 4 _bsaes_decrypt8: @@ -1071,18 +1074,18 @@ _bsaes_key_convert: -.globl bsaes_cbc_encrypt -.type bsaes_cbc_encrypt,%function +.globl ossl_bsaes_cbc_encrypt +.type ossl_bsaes_cbc_encrypt,%function .align 5 -bsaes_cbc_encrypt: +ossl_bsaes_cbc_encrypt: #ifndef __KERNEL__ cmp r2, #128 #ifndef __thumb__ blo AES_cbc_encrypt #else - bhs 1f + bhs .Lcbc_do_bsaes b AES_cbc_encrypt -1: +.Lcbc_do_bsaes: #endif #endif @@ -1336,12 +1339,12 @@ bsaes_cbc_encrypt: vst1.8 {q15}, [r8] @ return IV VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} -.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt +.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt -.globl bsaes_ctr32_encrypt_blocks -.type bsaes_ctr32_encrypt_blocks,%function +.globl ossl_bsaes_ctr32_encrypt_blocks +.type ossl_bsaes_ctr32_encrypt_blocks,%function .align 5 -bsaes_ctr32_encrypt_blocks: +ossl_bsaes_ctr32_encrypt_blocks: cmp r2, #8 @ use plain AES for blo .Lctr_enc_short @ small sizes @@ -1564,11 +1567,11 @@ bsaes_ctr32_encrypt_blocks: vstmia sp!, {q0,q1} ldmia sp!, {r4,r5,r6,r7,r8, pc} -.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.globl bsaes_xts_encrypt -.type bsaes_xts_encrypt,%function +.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks +.globl ossl_bsaes_xts_encrypt +.type ossl_bsaes_xts_encrypt,%function .align 4 -bsaes_xts_encrypt: +ossl_bsaes_xts_encrypt: mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 VFP_ABI_PUSH @@ -2043,12 +2046,12 @@ bsaes_xts_encrypt: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt +.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt -.globl bsaes_xts_decrypt -.type bsaes_xts_decrypt,%function +.globl ossl_bsaes_xts_decrypt +.type ossl_bsaes_xts_decrypt,%function .align 4 -bsaes_xts_decrypt: +ossl_bsaes_xts_decrypt: mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} @ 0x20 VFP_ABI_PUSH @@ -2554,5 +2557,5 @@ bsaes_xts_decrypt: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt #endif diff --git a/sys/crypto/openssl/arm/chacha-armv4.S b/sys/crypto/openssl/arm/chacha-armv4.S index 077500d3c188f..9b673c4c981e4 100644 --- a/sys/crypto/openssl/arm/chacha-armv4.S +++ b/sys/crypto/openssl/arm/chacha-armv4.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from chacha-armv4.pl. 
*/ #include "arm_arch.h" -.text #if defined(__thumb2__) || defined(__clang__) .syntax unified #endif @@ -15,6 +14,8 @@ #define ldrhsb ldrbhs #endif +.text + .align 5 .Lsigma: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral @@ -22,7 +23,11 @@ .long 1,0,0,0 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.LChaCha20_ctr32 +# endif #else .word -1 #endif @@ -49,8 +54,10 @@ ChaCha20_ctr32: cmp r2,#192 @ test len bls .Lshort ldr r4,[r14,#-32] +# if !defined(_WIN32) ldr r4,[r14,r4] -# ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r4,[r4] # endif tst r4,#ARMV7_NEON diff --git a/sys/crypto/openssl/arm/ecp_nistz256-armv4.S b/sys/crypto/openssl/arm/ecp_nistz256-armv4.S index 150b83583dea2..90338c24f733c 100644 --- a/sys/crypto/openssl/arm/ecp_nistz256-armv4.S +++ b/sys/crypto/openssl/arm/ecp_nistz256-armv4.S @@ -1,13 +1,13 @@ /* Do not modify. This file is auto-generated from ecp_nistz256-armv4.pl. */ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif +.section .rodata .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,%object .align 12 @@ -2381,6 +2381,8 @@ ecp_nistz256_precomputed: .byte 0xec,0xf0,0x42,0x88,0xd0,0x81,0x51,0xf9,0x1b,0xbc,0x43,0xa4,0x37,0xf1,0xd7,0x90,0x21,0x7e,0xa0,0x3e,0x63,0xfb,0x21,0xfa,0x12,0xfb,0xde,0xc7,0xbf,0xb3,0x58,0xe7,0x76,0x42,0x20,0x01,0x3d,0x66,0x80,0xf1,0xb8,0xaf,0xfa,0x7d,0x96,0x89,0x36,0x48,0x95,0xd9,0x6e,0x6d,0xe6,0x4f,0xff,0x2a,0x47,0x61,0xf2,0x04,0xb7,0x83,0x14,0xce .byte 0x0a,0x3c,0x73,0x17,0x50,0x88,0x03,0x25,0x4a,0xe3,0x13,0x55,0x8b,0x7e,0x50,0x38,0xfc,0x14,0x0b,0x04,0x8e,0xa8,0x5b,0xd6,0x72,0x20,0x60,0xe9,0xaa,0x22,0x82,0x11,0xc6,0xc4,0xd7,0xb9,0xc8,0x0c,0x7e,0x05,0xfb,0x90,0xe4,0x9c,0x28,0x89,0x29,0x99,0x63,0x4d,0xec,0x7b,0x50,0xbd,0xd8,0xa3,0x5b,0x50,0x77,0x19,0x81,0x92,0xce,0x82 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed + +.text .align 5 .LRR:@ 2^512 mod P precomputed for NIST P256 polynomial .long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb diff --git a/sys/crypto/openssl/arm/ghash-armv4.S b/sys/crypto/openssl/arm/ghash-armv4.S index 413ab2ffed1d6..e665ba66797fa 100644 --- a/sys/crypto/openssl/arm/ghash-armv4.S +++ b/sys/crypto/openssl/arm/ghash-armv4.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from ghash-armv4.pl. 
*/ #include "arm_arch.h" -.text #if defined(__thumb2__) || defined(__clang__) .syntax unified #define ldrplb ldrbpl @@ -13,6 +12,8 @@ .code 32 #endif +.text + .type rem_4bit,%object .align 5 rem_4bit: diff --git a/sys/crypto/openssl/arm/ghashv8-armx.S b/sys/crypto/openssl/arm/ghashv8-armx.S index f617d99814301..3234cc9ba497a 100644 --- a/sys/crypto/openssl/arm/ghashv8-armx.S +++ b/sys/crypto/openssl/arm/ghashv8-armx.S @@ -2,10 +2,17 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 -.text .fpu neon +#ifdef __thumb2__ +.syntax unified +.thumb +# define INST(a,b,c,d) .byte c,0xef,a,b +#else .code 32 -#undef __thumb2__ +# define INST(a,b,c,d) .byte a,b,c,0xf2 +#endif + +.text .globl gcm_init_v8 .type gcm_init_v8,%function .align 4 @@ -29,23 +36,23 @@ gcm_init_v8: @ calculate H^2 vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing -.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 + INST(0xa8,0x0e,0xa8,0xf2) @ pmull q0,q12,q12 veor q8,q8,q12 -.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 -.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 + INST(0xa9,0x4e,0xa9,0xf2) @ pmull2 q2,q12,q12 + INST(0xa0,0x2e,0xa0,0xf2) @ pmull q1,q8,q8 vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 veor q1,q1,q9 veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + INST(0x26,0x4e,0xe0,0xf2) @ pmull q10,q0,q11 @ 1st phase vmov d4,d3 @ Xh|Xm - 256-bit result vmov d3,d0 @ Xm is rotated Xl veor q0,q1,q10 vext.8 q10,q0,q0,#8 @ 2nd phase -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + INST(0x26,0x0e,0xa0,0xf2) @ pmull q0,q0,q11 veor q10,q10,q2 veor q14,q0,q10 @@ -68,23 +75,23 @@ gcm_gmult_v8: #endif vext.8 q3,q9,q9,#8 -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + INST(0x86,0x0e,0xa8,0xf2) @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + INST(0x87,0x4e,0xa9,0xf2) @ pmull2 q2,q12,q3 @ H.hi·Xi.hi + INST(0xa2,0x2e,0xaa,0xf2) @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 veor q1,q1,q9 veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + INST(0x26,0x4e,0xe0,0xf2) @ pmull q10,q0,q11 @ 1st phase of reduction vmov d4,d3 @ Xh|Xm - 256-bit result vmov d3,d0 @ Xm is rotated Xl veor q0,q1,q10 vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + INST(0x26,0x0e,0xa0,0xf2) @ pmull q0,q0,q11 veor q10,q10,q2 veor q0,q0,q10 @@ -120,6 +127,7 @@ gcm_ghash_v8: vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 vmov.i8 q11,#0xe1 vld1.64 {q14},[r1] + it eq moveq r12,#0 @ is it time to zero r12? vext.8 q0,q0,q0,#8 @ rotate Xi vld1.64 {q8},[r2]! @ load [rotated] I[0] @@ -136,26 +144,28 @@ gcm_ghash_v8: #endif vext.8 q7,q9,q9,#8 veor q3,q3,q0 @ I[i]^=Xi -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + INST(0x8e,0x8e,0xa8,0xf2) @ pmull q4,q12,q7 @ H·Ii+1 veor q9,q9,q7 @ Karatsuba pre-processing -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + INST(0x8f,0xce,0xa9,0xf2) @ pmull2 q6,q12,q7 b .Loop_mod2x_v8 .align 4 .Loop_mod2x_v8: vext.8 q10,q3,q3,#8 subs r3,r3,#32 @ is there more data? -.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + INST(0x86,0x0e,0xac,0xf2) @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + it lo movlo r12,#0 @ is it time to zero r12? 
-.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 + INST(0xa2,0xae,0xaa,0xf2) @ pmull q5,q13,q9 veor q10,q10,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi + INST(0x87,0x4e,0xad,0xf2) @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi veor q0,q0,q4 @ accumulate -.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + INST(0xa5,0x2e,0xab,0xf2) @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] veor q2,q2,q6 + it eq moveq r12,#0 @ is it time to zero r12? veor q1,q1,q5 @@ -167,7 +177,7 @@ gcm_ghash_v8: vrev64.8 q8,q8 #endif veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + INST(0x26,0x4e,0xe0,0xf2) @ pmull q10,q0,q11 @ 1st phase of reduction #ifndef __ARMEB__ vrev64.8 q9,q9 @@ -177,15 +187,15 @@ gcm_ghash_v8: vext.8 q7,q9,q9,#8 vext.8 q3,q8,q8,#8 veor q0,q1,q10 -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + INST(0x8e,0x8e,0xa8,0xf2) @ pmull q4,q12,q7 @ H·Ii+1 veor q3,q3,q2 @ accumulate q3 early vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + INST(0x26,0x0e,0xa0,0xf2) @ pmull q0,q0,q11 veor q3,q3,q10 veor q9,q9,q7 @ Karatsuba pre-processing veor q3,q3,q0 -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + INST(0x8f,0xce,0xa9,0xf2) @ pmull2 q6,q12,q7 bhs .Loop_mod2x_v8 @ there was at least 32 more bytes veor q2,q2,q10 @@ -198,23 +208,23 @@ gcm_ghash_v8: veor q3,q3,q0 @ inp^=Xi veor q9,q8,q10 @ q9 is rotated inp^Xi -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + INST(0x86,0x0e,0xa8,0xf2) @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + INST(0x87,0x4e,0xa9,0xf2) @ pmull2 q2,q12,q3 @ H.hi·Xi.hi + INST(0xa2,0x2e,0xaa,0xf2) @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 veor q1,q1,q9 veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + INST(0x26,0x4e,0xe0,0xf2) @ pmull q10,q0,q11 @ 1st phase of reduction vmov d4,d3 @ Xh|Xm - 256-bit result vmov d3,d0 @ Xm is rotated Xl veor q0,q1,q10 vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + INST(0x26,0x0e,0xa0,0xf2) @ pmull q0,q0,q11 veor q10,q10,q2 veor q0,q0,q10 diff --git a/sys/crypto/openssl/arm/keccak1600-armv4.S b/sys/crypto/openssl/arm/keccak1600-armv4.S index ae0cd9cd4b8b3..ddad05acad647 100644 --- a/sys/crypto/openssl/arm/keccak1600-armv4.S +++ b/sys/crypto/openssl/arm/keccak1600-armv4.S @@ -1,8 +1,6 @@ /* Do not modify. This file is auto-generated from keccak1600-armv4.pl. 
*/ #include "arm_arch.h" -.text - #if defined(__thumb2__) .syntax unified .thumb @@ -10,6 +8,8 @@ .code 32 #endif +.text + .type iotas32, %object .align 5 iotas32: @@ -1826,7 +1826,14 @@ KeccakF1600_enter: #endif blo .Lround2x +#if __ARM_ARCH__>=5 ldr pc,[sp,#440] +#else + ldr lr,[sp,#440] + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size KeccakF1600_int,.-KeccakF1600_int .type KeccakF1600, %function @@ -1865,7 +1872,14 @@ KeccakF1600: stmia r11, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} add sp,sp,#440+20 +#if __ARM_ARCH__>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size KeccakF1600,.-KeccakF1600 .globl SHA3_absorb .type SHA3_absorb,%function @@ -2011,7 +2025,14 @@ SHA3_absorb: .Labsorb_abort: add sp,sp,#456+32 mov r0,r12 @ return value +#if __ARM_ARCH__>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size SHA3_absorb,.-SHA3_absorb .globl SHA3_squeeze .type SHA3_squeeze,%function @@ -2156,7 +2177,14 @@ SHA3_squeeze: .align 4 .Lsqueeze_done: add sp,sp,#24 +#if __ARM_ARCH__>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif .size SHA3_squeeze,.-SHA3_squeeze #if __ARM_MAX_ARCH__>=7 .fpu neon @@ -2362,7 +2390,7 @@ KeccakF1600_neon: subs r3, r3, #1 bne .Loop_neon -.word 0xe12fff1e + bx lr .size KeccakF1600_neon,.-KeccakF1600_neon .globl SHA3_absorb_neon diff --git a/sys/crypto/openssl/arm/poly1305-armv4.S b/sys/crypto/openssl/arm/poly1305-armv4.S index 64d0ef9b5a38a..41b7caf06cc66 100644 --- a/sys/crypto/openssl/arm/poly1305-armv4.S +++ b/sys/crypto/openssl/arm/poly1305-armv4.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from poly1305-armv4.pl. 
*/ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb @@ -9,6 +8,8 @@ .code 32 #endif +.text + .globl poly1305_emit .globl poly1305_blocks .globl poly1305_init @@ -53,8 +54,10 @@ poly1305_init: and r4,r4,r10 #if __ARM_MAX_ARCH__>=7 +# if !defined(_WIN32) ldr r12,[r11,r12] @ OPENSSL_armcap_P -# ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] # endif #endif @@ -69,32 +72,22 @@ poly1305_init: #if __ARM_MAX_ARCH__>=7 tst r12,#ARMV7_NEON @ check for NEON -# ifdef __APPLE__ - adr r9,poly1305_blocks_neon - adr r11,poly1305_blocks -# ifdef __thumb2__ - it ne -# endif +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + adr r12,.Lpoly1305_emit + adr r10,.Lpoly1305_emit_neon + itt ne movne r11,r9 - adr r12,poly1305_emit - adr r10,poly1305_emit_neon -# ifdef __thumb2__ - it ne -# endif movne r12,r10 + orr r11,r11,#1 @ thumb-ify address + orr r12,r12,#1 # else -# ifdef __thumb2__ - itete eq -# endif addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init) addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) # endif -# ifdef __thumb2__ - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 -# endif #endif ldrb r9,[r1,#11] orr r6,r6,r7,lsl#8 @@ -1162,7 +1155,11 @@ poly1305_emit_neon: .Lzeros: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lpoly1305_init +# endif #endif .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 diff --git a/sys/crypto/openssl/arm/sha1-armv4-large.S b/sys/crypto/openssl/arm/sha1-armv4-large.S index 2492c049a2a0a..c23e1df6e38ed 100644 --- a/sys/crypto/openssl/arm/sha1-armv4-large.S +++ b/sys/crypto/openssl/arm/sha1-armv4-large.S @@ -1,7 +1,6 @@ /* Do not modify. This file is auto-generated from sha1-armv4-large.pl. */ #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb @@ -9,6 +8,8 @@ .code 32 #endif +.text + .globl sha1_block_data_order .type sha1_block_data_order,%function @@ -16,12 +17,14 @@ sha1_block_data_order: #if __ARM_MAX_ARCH__>=7 .Lsha1_block: - adr r3,.Lsha1_block ldr r12,.LOPENSSL_armcap +# if !defined(_WIN32) + adr r3,.Lsha1_block ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] -#endif +# endif tst r12,#ARMV8_SHA1 bne .LARMv8 tst r12,#ARMV7_NEON @@ -486,7 +489,11 @@ sha1_block_data_order: .LK_60_79:.word 0xca62c1d6 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lsha1_block +# endif #endif .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 diff --git a/sys/crypto/openssl/arm/sha256-armv4.S b/sys/crypto/openssl/arm/sha256-armv4.S index 5b541faf07f7d..c1b775f767c8a 100644 --- a/sys/crypto/openssl/arm/sha256-armv4.S +++ b/sys/crypto/openssl/arm/sha256-armv4.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from sha256-armv4.pl. */ @ Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. 
@ -@ Licensed under the OpenSSL license (the "License"). You may not use +@ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. You can obtain a copy @ in the file LICENSE in the source distribution or at @ https://www.openssl.org/source/license.html @@ -44,6 +44,8 @@ @ @ Add ARMv8 code path performing at 2.0 cpb on Apple A7. +@ $output is the last argument if it looks like a file (it has an extension) +@ $flavour is the first argument if it doesn't look like a file #ifndef __KERNEL__ # include "arm_arch.h" #else @@ -51,7 +53,6 @@ # define __ARM_MAX_ARCH__ 7 #endif -.text #if defined(__thumb2__) .syntax unified .thumb @@ -59,6 +60,8 @@ .code 32 #endif +.text + .type K256,%object .align 5 K256: @@ -82,7 +85,11 @@ K256: .word 0 @ terminator #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lsha256_block_data_order +# endif #endif .align 5 @@ -97,10 +104,12 @@ sha256_block_data_order: #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap +# if !defined(_WIN32) ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] -#endif +# endif tst r12,#ARMV8_SHA256 bne .LARMv8 tst r12,#ARMV7_NEON diff --git a/sys/crypto/openssl/arm/sha512-armv4.S b/sys/crypto/openssl/arm/sha512-armv4.S index f1522a8cbff73..1f84a8ff93fe9 100644 --- a/sys/crypto/openssl/arm/sha512-armv4.S +++ b/sys/crypto/openssl/arm/sha512-armv4.S @@ -1,7 +1,7 @@ /* Do not modify. This file is auto-generated from sha512-armv4.pl. */ @ Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. @ -@ Licensed under the OpenSSL license (the "License"). You may not use +@ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. You can obtain a copy @ in the file LICENSE in the source distribution or at @ https://www.openssl.org/source/license.html @@ -74,7 +74,6 @@ # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 #endif -.text #if defined(__thumb2__) .syntax unified .thumb @@ -83,6 +82,8 @@ .code 32 #endif +.text + .type K512,%object .align 5 K512: @@ -129,7 +130,11 @@ K512: .size K512,.-K512 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lsha512_block_data_order +# endif .skip 32-4 #else .skip 32 @@ -146,10 +151,12 @@ sha512_block_data_order: #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap +# if !defined(_WIN32) ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] -#endif +# endif tst r12,#ARMV7_NEON bne .LNEON #endif diff --git a/sys/crypto/openssl/i386/aes-586.S b/sys/crypto/openssl/i386/aes-586.S new file mode 100644 index 0000000000000..861ee21e8400e --- /dev/null +++ b/sys/crypto/openssl/i386/aes-586.S @@ -0,0 +1,6644 @@ +/* Do not modify. This file is auto-generated from aes-586.pl. 
*/ +#ifdef PIC +.text +.type _x86_AES_encrypt_compact,@function +.align 16 +_x86_AES_encrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi +.align 16 +.L000loop: + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movzbl -128(%ebp,%esi,1),%esi + movzbl %ch,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ah,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $8,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movzbl -128(%ebp,%ecx,1),%ecx + shll $24,%ecx + xorl %ecx,%edx + movl %esi,%ecx + + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %ecx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %ecx,%edi + xorl %esi,%ecx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%ecx + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%ecx + andl %edx,%ebp + leal (%edx,%edx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %edx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %edx,%edi + xorl %esi,%edx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%edx + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%edx + andl %eax,%ebp + leal (%eax,%eax,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %eax,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %eax,%edi + xorl %esi,%eax + rorl $24,%edi + xorl %ebp,%esi + roll $24,%eax + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%eax + andl %ebx,%ebp + leal (%ebx,%ebx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %ebx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %ebx,%edi + xorl %esi,%ebx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%ebx + xorl %edi,%esi + xorl %esi,%ebx + movl 20(%esp),%edi + movl 28(%esp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb 
.L000loop + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movzbl -128(%ebp,%esi,1),%esi + movzbl %ch,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + + movl 20(%esp),%edi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ah,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $8,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movzbl -128(%ebp,%ecx,1),%ecx + shll $24,%ecx + xorl %ecx,%edx + movl %esi,%ecx + + xorl 16(%edi),%eax + xorl 20(%edi),%ebx + xorl 24(%edi),%ecx + xorl 28(%edi),%edx + ret +.size _x86_AES_encrypt_compact,.-_x86_AES_encrypt_compact +.type _sse_AES_encrypt_compact,@function +.align 16 +_sse_AES_encrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl $454761243,%eax + movl %eax,8(%esp) + movl %eax,12(%esp) + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx +.align 16 +.L001loop: + pshufw $8,%mm0,%mm1 + pshufw $13,%mm4,%mm5 + movd %mm1,%eax + movd %mm5,%ebx + movl %edi,20(%esp) + movzbl %al,%esi + movzbl %ah,%edx + pshufw $13,%mm0,%mm2 + movzbl -128(%ebp,%esi,1),%ecx + movzbl %bl,%edi + movzbl -128(%ebp,%edx,1),%edx + shrl $16,%eax + shll $8,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $16,%esi + pshufw $8,%mm4,%mm6 + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $24,%esi + shrl $16,%ebx + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $8,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + movd %mm2,%eax + movd %ecx,%mm0 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %ah,%edi + shll $16,%ecx + movd %mm6,%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + shll $8,%esi + shrl $16,%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shrl $16,%eax + movd %ecx,%mm1 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %ah,%edi + shll $16,%ecx + andl $255,%eax + orl %esi,%ecx + punpckldq %mm1,%mm0 + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + andl $255,%ebx + movzbl -128(%ebp,%eax,1),%eax + orl %esi,%ecx + shll $16,%eax 
+ movzbl -128(%ebp,%edi,1),%esi + orl %eax,%edx + shll $8,%esi + movzbl -128(%ebp,%ebx,1),%ebx + orl %esi,%ecx + orl %ebx,%edx + movl 20(%esp),%edi + movd %ecx,%mm4 + movd %edx,%mm5 + punpckldq %mm5,%mm4 + addl $16,%edi + cmpl 24(%esp),%edi + ja .L002out + movq 8(%esp),%mm2 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + movq %mm0,%mm1 + movq %mm4,%mm5 + pcmpgtb %mm0,%mm3 + pcmpgtb %mm4,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + pshufw $177,%mm0,%mm2 + pshufw $177,%mm4,%mm6 + paddb %mm0,%mm0 + paddb %mm4,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pshufw $177,%mm2,%mm3 + pshufw $177,%mm6,%mm7 + pxor %mm0,%mm1 + pxor %mm4,%mm5 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq %mm3,%mm2 + movq %mm7,%mm6 + pslld $8,%mm3 + pslld $8,%mm7 + psrld $24,%mm2 + psrld $24,%mm6 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq %mm1,%mm3 + movq %mm5,%mm7 + movq (%edi),%mm2 + movq 8(%edi),%mm6 + psrld $8,%mm1 + psrld $8,%mm5 + movl -128(%ebp),%eax + pslld $24,%mm3 + pslld $24,%mm7 + movl -64(%ebp),%ebx + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movl (%ebp),%ecx + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movl 64(%ebp),%edx + pxor %mm2,%mm0 + pxor %mm6,%mm4 + jmp .L001loop +.align 16 +.L002out: + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + ret +.size _sse_AES_encrypt_compact,.-_sse_AES_encrypt_compact +.type _x86_AES_encrypt,@function +.align 16 +_x86_AES_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) +.align 16 +.L003loop: + movl %eax,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %bh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movl (%ebp,%esi,8),%esi + movzbl %ch,%edi + xorl 3(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movl (%ebp,%esi,8),%esi + movzbl %dh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movzbl %bh,%edi + xorl 1(%ebp,%edi,8),%esi + + movl 20(%esp),%edi + movl (%ebp,%edx,8),%edx + movzbl %ah,%eax + xorl 3(%ebp,%eax,8),%edx + movl 4(%esp),%eax + andl $255,%ebx + xorl 2(%ebp,%ebx,8),%edx + movl 8(%esp),%ebx + xorl 1(%ebp,%ecx,8),%edx + movl %esi,%ecx + + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L003loop + movl %eax,%esi + andl $255,%esi + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %bh,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %ch,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movl 2(%ebp,%edi,8),%edi + andl 
$4278190080,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %dh,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movzbl %bh,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movl 2(%ebp,%edx,8),%edx + andl $255,%edx + movzbl %ah,%eax + movl (%ebp,%eax,8),%eax + andl $65280,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movl (%ebp,%ebx,8),%ebx + andl $16711680,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movl 2(%ebp,%ecx,8),%ecx + andl $4278190080,%ecx + xorl %ecx,%edx + movl %esi,%ecx + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + ret +.align 64 +.LAES_Te: +.long 2774754246,2774754246 +.long 2222750968,2222750968 +.long 2574743534,2574743534 +.long 2373680118,2373680118 +.long 234025727,234025727 +.long 3177933782,3177933782 +.long 2976870366,2976870366 +.long 1422247313,1422247313 +.long 1345335392,1345335392 +.long 50397442,50397442 +.long 2842126286,2842126286 +.long 2099981142,2099981142 +.long 436141799,436141799 +.long 1658312629,1658312629 +.long 3870010189,3870010189 +.long 2591454956,2591454956 +.long 1170918031,1170918031 +.long 2642575903,2642575903 +.long 1086966153,1086966153 +.long 2273148410,2273148410 +.long 368769775,368769775 +.long 3948501426,3948501426 +.long 3376891790,3376891790 +.long 200339707,200339707 +.long 3970805057,3970805057 +.long 1742001331,1742001331 +.long 4255294047,4255294047 +.long 3937382213,3937382213 +.long 3214711843,3214711843 +.long 4154762323,4154762323 +.long 2524082916,2524082916 +.long 1539358875,1539358875 +.long 3266819957,3266819957 +.long 486407649,486407649 +.long 2928907069,2928907069 +.long 1780885068,1780885068 +.long 1513502316,1513502316 +.long 1094664062,1094664062 +.long 49805301,49805301 +.long 1338821763,1338821763 +.long 1546925160,1546925160 +.long 4104496465,4104496465 +.long 887481809,887481809 +.long 150073849,150073849 +.long 2473685474,2473685474 +.long 1943591083,1943591083 +.long 1395732834,1395732834 +.long 1058346282,1058346282 +.long 201589768,201589768 +.long 1388824469,1388824469 +.long 1696801606,1696801606 +.long 1589887901,1589887901 +.long 672667696,672667696 +.long 2711000631,2711000631 +.long 251987210,251987210 +.long 3046808111,3046808111 +.long 151455502,151455502 +.long 907153956,907153956 +.long 2608889883,2608889883 +.long 1038279391,1038279391 +.long 652995533,652995533 +.long 1764173646,1764173646 +.long 3451040383,3451040383 +.long 2675275242,2675275242 +.long 453576978,453576978 +.long 2659418909,2659418909 +.long 1949051992,1949051992 +.long 773462580,773462580 +.long 756751158,756751158 +.long 2993581788,2993581788 +.long 3998898868,3998898868 +.long 4221608027,4221608027 +.long 4132590244,4132590244 +.long 1295727478,1295727478 +.long 1641469623,1641469623 +.long 3467883389,3467883389 +.long 2066295122,2066295122 +.long 1055122397,1055122397 +.long 1898917726,1898917726 +.long 2542044179,2542044179 +.long 4115878822,4115878822 +.long 1758581177,1758581177 +.long 0,0 +.long 753790401,753790401 +.long 1612718144,1612718144 +.long 536673507,536673507 +.long 3367088505,3367088505 +.long 3982187446,3982187446 +.long 3194645204,3194645204 +.long 1187761037,1187761037 +.long 3653156455,3653156455 +.long 1262041458,1262041458 +.long 
3729410708,3729410708 +.long 3561770136,3561770136 +.long 3898103984,3898103984 +.long 1255133061,1255133061 +.long 1808847035,1808847035 +.long 720367557,720367557 +.long 3853167183,3853167183 +.long 385612781,385612781 +.long 3309519750,3309519750 +.long 3612167578,3612167578 +.long 1429418854,1429418854 +.long 2491778321,2491778321 +.long 3477423498,3477423498 +.long 284817897,284817897 +.long 100794884,100794884 +.long 2172616702,2172616702 +.long 4031795360,4031795360 +.long 1144798328,1144798328 +.long 3131023141,3131023141 +.long 3819481163,3819481163 +.long 4082192802,4082192802 +.long 4272137053,4272137053 +.long 3225436288,3225436288 +.long 2324664069,2324664069 +.long 2912064063,2912064063 +.long 3164445985,3164445985 +.long 1211644016,1211644016 +.long 83228145,83228145 +.long 3753688163,3753688163 +.long 3249976951,3249976951 +.long 1977277103,1977277103 +.long 1663115586,1663115586 +.long 806359072,806359072 +.long 452984805,452984805 +.long 250868733,250868733 +.long 1842533055,1842533055 +.long 1288555905,1288555905 +.long 336333848,336333848 +.long 890442534,890442534 +.long 804056259,804056259 +.long 3781124030,3781124030 +.long 2727843637,2727843637 +.long 3427026056,3427026056 +.long 957814574,957814574 +.long 1472513171,1472513171 +.long 4071073621,4071073621 +.long 2189328124,2189328124 +.long 1195195770,1195195770 +.long 2892260552,2892260552 +.long 3881655738,3881655738 +.long 723065138,723065138 +.long 2507371494,2507371494 +.long 2690670784,2690670784 +.long 2558624025,2558624025 +.long 3511635870,3511635870 +.long 2145180835,2145180835 +.long 1713513028,1713513028 +.long 2116692564,2116692564 +.long 2878378043,2878378043 +.long 2206763019,2206763019 +.long 3393603212,3393603212 +.long 703524551,703524551 +.long 3552098411,3552098411 +.long 1007948840,1007948840 +.long 2044649127,2044649127 +.long 3797835452,3797835452 +.long 487262998,487262998 +.long 1994120109,1994120109 +.long 1004593371,1004593371 +.long 1446130276,1446130276 +.long 1312438900,1312438900 +.long 503974420,503974420 +.long 3679013266,3679013266 +.long 168166924,168166924 +.long 1814307912,1814307912 +.long 3831258296,3831258296 +.long 1573044895,1573044895 +.long 1859376061,1859376061 +.long 4021070915,4021070915 +.long 2791465668,2791465668 +.long 2828112185,2828112185 +.long 2761266481,2761266481 +.long 937747667,937747667 +.long 2339994098,2339994098 +.long 854058965,854058965 +.long 1137232011,1137232011 +.long 1496790894,1496790894 +.long 3077402074,3077402074 +.long 2358086913,2358086913 +.long 1691735473,1691735473 +.long 3528347292,3528347292 +.long 3769215305,3769215305 +.long 3027004632,3027004632 +.long 4199962284,4199962284 +.long 133494003,133494003 +.long 636152527,636152527 +.long 2942657994,2942657994 +.long 2390391540,2390391540 +.long 3920539207,3920539207 +.long 403179536,403179536 +.long 3585784431,3585784431 +.long 2289596656,2289596656 +.long 1864705354,1864705354 +.long 1915629148,1915629148 +.long 605822008,605822008 +.long 4054230615,4054230615 +.long 3350508659,3350508659 +.long 1371981463,1371981463 +.long 602466507,602466507 +.long 2094914977,2094914977 +.long 2624877800,2624877800 +.long 555687742,555687742 +.long 3712699286,3712699286 +.long 3703422305,3703422305 +.long 2257292045,2257292045 +.long 2240449039,2240449039 +.long 2423288032,2423288032 +.long 1111375484,1111375484 +.long 3300242801,3300242801 +.long 2858837708,2858837708 +.long 3628615824,3628615824 +.long 84083462,84083462 +.long 32962295,32962295 +.long 302911004,302911004 +.long 
2741068226,2741068226 +.long 1597322602,1597322602 +.long 4183250862,4183250862 +.long 3501832553,3501832553 +.long 2441512471,2441512471 +.long 1489093017,1489093017 +.long 656219450,656219450 +.long 3114180135,3114180135 +.long 954327513,954327513 +.long 335083755,335083755 +.long 3013122091,3013122091 +.long 856756514,856756514 +.long 3144247762,3144247762 +.long 1893325225,1893325225 +.long 2307821063,2307821063 +.long 2811532339,2811532339 +.long 3063651117,3063651117 +.long 572399164,572399164 +.long 2458355477,2458355477 +.long 552200649,552200649 +.long 1238290055,1238290055 +.long 4283782570,4283782570 +.long 2015897680,2015897680 +.long 2061492133,2061492133 +.long 2408352771,2408352771 +.long 4171342169,4171342169 +.long 2156497161,2156497161 +.long 386731290,386731290 +.long 3669999461,3669999461 +.long 837215959,837215959 +.long 3326231172,3326231172 +.long 3093850320,3093850320 +.long 3275833730,3275833730 +.long 2962856233,2962856233 +.long 1999449434,1999449434 +.long 286199582,286199582 +.long 3417354363,3417354363 +.long 4233385128,4233385128 +.long 3602627437,3602627437 +.long 974525996,974525996 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 
173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.long 1,2,4,8 +.long 16,32,64,128 +.long 27,54,0,0 +.long 0,0,0,0 +.size _x86_AES_encrypt,.-_x86_AES_encrypt +.globl AES_encrypt +.type AES_encrypt,@function +.align 16 +AES_encrypt: +.L_AES_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 28(%esp),%edi + movl %esp,%eax + subl $36,%esp + andl $-64,%esp + leal -127(%edi),%ebx + subl %esp,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esp + addl $4,%esp + movl %eax,28(%esp) + call .L004pic_point +.L004pic_point: + popl %ebp + leal OPENSSL_ia32cap_P-.L004pic_point(%ebp),%eax + leal .LAES_Te-.L004pic_point(%ebp),%ebp + leal 764(%esp),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + btl $25,(%eax) + jnc .L005x86 + movq (%esi),%mm0 + movq 8(%esi),%mm4 + call _sse_AES_encrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 16 +.L005x86: + movl %ebp,24(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + call _x86_AES_encrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_encrypt,.-.L_AES_encrypt_begin +.type 
_x86_AES_decrypt_compact,@function +.align 16 +_x86_AES_decrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi +.align 16 +.L006loop: + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl -128(%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + shrl $24,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%eax + subl %edi,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %esi,%eax + movl $2155905152,%edi + andl %eax,%edi + movl %edi,%esi + shrl $7,%edi + leal (%eax,%eax,1),%ebx + subl %edi,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %ecx,%eax + xorl %esi,%ebx + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %ecx,%ebx + roll $8,%ecx + xorl %esi,%ebp + xorl %eax,%ecx + xorl %ebp,%eax + xorl %ebx,%ecx + xorl %ebp,%ebx + roll $24,%eax + xorl %ebp,%ecx + roll $16,%ebx + xorl %eax,%ecx + roll $8,%ebp + xorl %ebx,%ecx + movl 4(%esp),%eax + xorl %ebp,%ecx + movl %ecx,12(%esp) + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebx + subl %edi,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %esi,%ebx + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %edx,%ebx + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %edx,%ecx + roll $8,%edx + xorl %esi,%ebp + xorl %ebx,%edx + xorl %ebp,%ebx + xorl %ecx,%edx + xorl %ebp,%ecx + roll $24,%ebx + xorl %ebp,%edx + roll $16,%ecx + xorl %ebx,%edx + roll $8,%ebp + xorl %ecx,%edx + movl 8(%esp),%ebx 
+ xorl %ebp,%edx + movl %edx,16(%esp) + movl $2155905152,%edi + andl %eax,%edi + movl %edi,%esi + shrl $7,%edi + leal (%eax,%eax,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%edx + subl %edi,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %eax,%ecx + xorl %esi,%edx + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %eax,%edx + roll $8,%eax + xorl %esi,%ebp + xorl %ecx,%eax + xorl %ebp,%ecx + xorl %edx,%eax + xorl %ebp,%edx + roll $24,%ecx + xorl %ebp,%eax + roll $16,%edx + xorl %ecx,%eax + roll $8,%ebp + xorl %edx,%eax + xorl %ebp,%eax + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%edx + subl %edi,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %ebx,%ecx + xorl %esi,%edx + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %ebx,%edx + roll $8,%ebx + xorl %esi,%ebp + xorl %ecx,%ebx + xorl %ebp,%ecx + xorl %edx,%ebx + xorl %ebp,%edx + roll $24,%ecx + xorl %ebp,%ebx + roll $16,%edx + xorl %ecx,%ebx + roll $8,%ebp + xorl %edx,%ebx + movl 12(%esp),%ecx + xorl %ebp,%ebx + movl 16(%esp),%edx + movl 20(%esp),%edi + movl 28(%esp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L006loop + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl -128(%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + shrl $24,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl 4(%esp),%eax + xorl 16(%edi),%eax + xorl 20(%edi),%ebx + xorl 24(%edi),%ecx + xorl 28(%edi),%edx + ret +.size _x86_AES_decrypt_compact,.-_x86_AES_decrypt_compact +.type _sse_AES_decrypt_compact,@function +.align 16 
+_sse_AES_decrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl $454761243,%eax + movl %eax,8(%esp) + movl %eax,12(%esp) + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx +.align 16 +.L007loop: + pshufw $12,%mm0,%mm1 + pshufw $9,%mm4,%mm5 + movd %mm1,%eax + movd %mm5,%ebx + movl %edi,20(%esp) + movzbl %al,%esi + movzbl %ah,%edx + pshufw $6,%mm0,%mm2 + movzbl -128(%ebp,%esi,1),%ecx + movzbl %bl,%edi + movzbl -128(%ebp,%edx,1),%edx + shrl $16,%eax + shll $8,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $16,%esi + pshufw $3,%mm4,%mm6 + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $24,%esi + shrl $16,%ebx + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $8,%esi + movd %mm2,%eax + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + shll $16,%esi + movd %mm6,%ebx + movd %ecx,%mm0 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %al,%edi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $16,%esi + shrl $16,%eax + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shrl $16,%ebx + shll $8,%esi + movd %edx,%mm1 + movzbl -128(%ebp,%edi,1),%edx + movzbl %bh,%edi + shll $24,%edx + andl $255,%ebx + orl %esi,%edx + punpckldq %mm1,%mm0 + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $8,%esi + movzbl %ah,%eax + movzbl -128(%ebp,%ebx,1),%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + orl %ebx,%edx + shll $16,%esi + movzbl -128(%ebp,%eax,1),%eax + orl %esi,%edx + shll $24,%eax + orl %eax,%ecx + movl 20(%esp),%edi + movd %edx,%mm4 + movd %ecx,%mm5 + punpckldq %mm5,%mm4 + addl $16,%edi + cmpl 24(%esp),%edi + ja .L008out + movq %mm0,%mm3 + movq %mm4,%mm7 + pshufw $228,%mm0,%mm2 + pshufw $228,%mm4,%mm6 + movq %mm0,%mm1 + movq %mm4,%mm5 + pshufw $177,%mm0,%mm0 + pshufw $177,%mm4,%mm4 + pslld $8,%mm2 + pslld $8,%mm6 + psrld $8,%mm3 + psrld $8,%mm7 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pslld $16,%mm2 + pslld $16,%mm6 + psrld $16,%mm3 + psrld $16,%mm7 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movq 8(%esp),%mm3 + pxor %mm2,%mm2 + pxor %mm6,%mm6 + pcmpgtb %mm1,%mm2 + pcmpgtb %mm5,%mm6 + pand %mm3,%mm2 + pand %mm3,%mm6 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm2,%mm1 + pxor %mm6,%mm5 + movq %mm1,%mm3 + movq %mm5,%mm7 + movq %mm1,%mm2 + movq %mm5,%mm6 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pslld $24,%mm3 + pslld $24,%mm7 + psrld $8,%mm2 + psrld $8,%mm6 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq 8(%esp),%mm2 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + pcmpgtb %mm1,%mm3 + pcmpgtb %mm5,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm3,%mm1 + pxor %mm7,%mm5 + pshufw $177,%mm1,%mm3 + pshufw $177,%mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + pcmpgtb %mm1,%mm3 + pcmpgtb %mm5,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm3,%mm1 + pxor %mm7,%mm5 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movq %mm1,%mm3 + movq %mm5,%mm7 + pshufw $177,%mm1,%mm2 + pshufw $177,%mm5,%mm6 + 
pxor %mm2,%mm0 + pxor %mm6,%mm4 + pslld $8,%mm1 + pslld $8,%mm5 + psrld $8,%mm3 + psrld $8,%mm7 + movq (%edi),%mm2 + movq 8(%edi),%mm6 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movl -128(%ebp),%eax + pslld $16,%mm1 + pslld $16,%mm5 + movl -64(%ebp),%ebx + psrld $16,%mm3 + psrld $16,%mm7 + movl (%ebp),%ecx + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movl 64(%ebp),%edx + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + jmp .L007loop +.align 16 +.L008out: + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + ret +.size _sse_AES_decrypt_compact,.-_sse_AES_decrypt_compact +.type _x86_AES_decrypt,@function +.align 16 +_x86_AES_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) +.align 16 +.L009loop: + movl %eax,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %dh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %ebx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %ah,%edi + xorl 3(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %bh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + + movl 20(%esp),%edi + andl $255,%edx + movl (%ebp,%edx,8),%edx + movzbl %ch,%ecx + xorl 3(%ebp,%ecx,8),%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + xorl 2(%ebp,%ebx,8),%edx + movl 8(%esp),%ebx + shrl $24,%eax + xorl 1(%ebp,%eax,8),%edx + movl 4(%esp),%eax + + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L009loop + leal 2176(%ebp),%ebp + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi + leal -128(%ebp),%ebp + movl %eax,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movzbl (%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl (%ebp,%ecx,1),%ecx + shll $8,%ecx + 
xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl (%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + shrl $24,%eax + movzbl (%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl 4(%esp),%eax + leal -2048(%ebp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + ret +.align 64 +.LAES_Td: +.long 1353184337,1353184337 +.long 1399144830,1399144830 +.long 3282310938,3282310938 +.long 2522752826,2522752826 +.long 3412831035,3412831035 +.long 4047871263,4047871263 +.long 2874735276,2874735276 +.long 2466505547,2466505547 +.long 1442459680,1442459680 +.long 4134368941,4134368941 +.long 2440481928,2440481928 +.long 625738485,625738485 +.long 4242007375,4242007375 +.long 3620416197,3620416197 +.long 2151953702,2151953702 +.long 2409849525,2409849525 +.long 1230680542,1230680542 +.long 1729870373,1729870373 +.long 2551114309,2551114309 +.long 3787521629,3787521629 +.long 41234371,41234371 +.long 317738113,317738113 +.long 2744600205,2744600205 +.long 3338261355,3338261355 +.long 3881799427,3881799427 +.long 2510066197,2510066197 +.long 3950669247,3950669247 +.long 3663286933,3663286933 +.long 763608788,763608788 +.long 3542185048,3542185048 +.long 694804553,694804553 +.long 1154009486,1154009486 +.long 1787413109,1787413109 +.long 2021232372,2021232372 +.long 1799248025,1799248025 +.long 3715217703,3715217703 +.long 3058688446,3058688446 +.long 397248752,397248752 +.long 1722556617,1722556617 +.long 3023752829,3023752829 +.long 407560035,407560035 +.long 2184256229,2184256229 +.long 1613975959,1613975959 +.long 1165972322,1165972322 +.long 3765920945,3765920945 +.long 2226023355,2226023355 +.long 480281086,480281086 +.long 2485848313,2485848313 +.long 1483229296,1483229296 +.long 436028815,436028815 +.long 2272059028,2272059028 +.long 3086515026,3086515026 +.long 601060267,601060267 +.long 3791801202,3791801202 +.long 1468997603,1468997603 +.long 715871590,715871590 +.long 120122290,120122290 +.long 63092015,63092015 +.long 2591802758,2591802758 +.long 2768779219,2768779219 +.long 4068943920,4068943920 +.long 2997206819,2997206819 +.long 3127509762,3127509762 +.long 1552029421,1552029421 +.long 723308426,723308426 +.long 2461301159,2461301159 +.long 4042393587,4042393587 +.long 2715969870,2715969870 +.long 3455375973,3455375973 +.long 3586000134,3586000134 +.long 526529745,526529745 +.long 2331944644,2331944644 +.long 2639474228,2639474228 +.long 2689987490,2689987490 +.long 853641733,853641733 +.long 1978398372,1978398372 +.long 971801355,971801355 +.long 2867814464,2867814464 +.long 111112542,111112542 +.long 1360031421,1360031421 +.long 4186579262,4186579262 +.long 1023860118,1023860118 +.long 2919579357,2919579357 +.long 1186850381,1186850381 +.long 3045938321,3045938321 +.long 90031217,90031217 +.long 1876166148,1876166148 +.long 4279586912,4279586912 +.long 620468249,620468249 +.long 2548678102,2548678102 +.long 3426959497,3426959497 +.long 2006899047,2006899047 +.long 3175278768,3175278768 +.long 2290845959,2290845959 +.long 945494503,945494503 +.long 3689859193,3689859193 +.long 1191869601,1191869601 +.long 3910091388,3910091388 +.long 3374220536,3374220536 +.long 0,0 +.long 2206629897,2206629897 +.long 1223502642,1223502642 +.long 2893025566,2893025566 +.long 1316117100,1316117100 +.long 4227796733,4227796733 +.long 1446544655,1446544655 +.long 517320253,517320253 +.long 658058550,658058550 +.long 1691946762,1691946762 +.long 564550760,564550760 +.long 3511966619,3511966619 +.long 
976107044,976107044 +.long 2976320012,2976320012 +.long 266819475,266819475 +.long 3533106868,3533106868 +.long 2660342555,2660342555 +.long 1338359936,1338359936 +.long 2720062561,2720062561 +.long 1766553434,1766553434 +.long 370807324,370807324 +.long 179999714,179999714 +.long 3844776128,3844776128 +.long 1138762300,1138762300 +.long 488053522,488053522 +.long 185403662,185403662 +.long 2915535858,2915535858 +.long 3114841645,3114841645 +.long 3366526484,3366526484 +.long 2233069911,2233069911 +.long 1275557295,1275557295 +.long 3151862254,3151862254 +.long 4250959779,4250959779 +.long 2670068215,2670068215 +.long 3170202204,3170202204 +.long 3309004356,3309004356 +.long 880737115,880737115 +.long 1982415755,1982415755 +.long 3703972811,3703972811 +.long 1761406390,1761406390 +.long 1676797112,1676797112 +.long 3403428311,3403428311 +.long 277177154,277177154 +.long 1076008723,1076008723 +.long 538035844,538035844 +.long 2099530373,2099530373 +.long 4164795346,4164795346 +.long 288553390,288553390 +.long 1839278535,1839278535 +.long 1261411869,1261411869 +.long 4080055004,4080055004 +.long 3964831245,3964831245 +.long 3504587127,3504587127 +.long 1813426987,1813426987 +.long 2579067049,2579067049 +.long 4199060497,4199060497 +.long 577038663,577038663 +.long 3297574056,3297574056 +.long 440397984,440397984 +.long 3626794326,3626794326 +.long 4019204898,4019204898 +.long 3343796615,3343796615 +.long 3251714265,3251714265 +.long 4272081548,4272081548 +.long 906744984,906744984 +.long 3481400742,3481400742 +.long 685669029,685669029 +.long 646887386,646887386 +.long 2764025151,2764025151 +.long 3835509292,3835509292 +.long 227702864,227702864 +.long 2613862250,2613862250 +.long 1648787028,1648787028 +.long 3256061430,3256061430 +.long 3904428176,3904428176 +.long 1593260334,1593260334 +.long 4121936770,4121936770 +.long 3196083615,3196083615 +.long 2090061929,2090061929 +.long 2838353263,2838353263 +.long 3004310991,3004310991 +.long 999926984,999926984 +.long 2809993232,2809993232 +.long 1852021992,1852021992 +.long 2075868123,2075868123 +.long 158869197,158869197 +.long 4095236462,4095236462 +.long 28809964,28809964 +.long 2828685187,2828685187 +.long 1701746150,1701746150 +.long 2129067946,2129067946 +.long 147831841,147831841 +.long 3873969647,3873969647 +.long 3650873274,3650873274 +.long 3459673930,3459673930 +.long 3557400554,3557400554 +.long 3598495785,3598495785 +.long 2947720241,2947720241 +.long 824393514,824393514 +.long 815048134,815048134 +.long 3227951669,3227951669 +.long 935087732,935087732 +.long 2798289660,2798289660 +.long 2966458592,2966458592 +.long 366520115,366520115 +.long 1251476721,1251476721 +.long 4158319681,4158319681 +.long 240176511,240176511 +.long 804688151,804688151 +.long 2379631990,2379631990 +.long 1303441219,1303441219 +.long 1414376140,1414376140 +.long 3741619940,3741619940 +.long 3820343710,3820343710 +.long 461924940,461924940 +.long 3089050817,3089050817 +.long 2136040774,2136040774 +.long 82468509,82468509 +.long 1563790337,1563790337 +.long 1937016826,1937016826 +.long 776014843,776014843 +.long 1511876531,1511876531 +.long 1389550482,1389550482 +.long 861278441,861278441 +.long 323475053,323475053 +.long 2355222426,2355222426 +.long 2047648055,2047648055 +.long 2383738969,2383738969 +.long 2302415851,2302415851 +.long 3995576782,3995576782 +.long 902390199,902390199 +.long 3991215329,3991215329 +.long 1018251130,1018251130 +.long 1507840668,1507840668 +.long 1064563285,1064563285 +.long 2043548696,2043548696 +.long 3208103795,3208103795 
+.long 3939366739,3939366739 +.long 1537932639,1537932639 +.long 342834655,342834655 +.long 2262516856,2262516856 +.long 2180231114,2180231114 +.long 1053059257,1053059257 +.long 741614648,741614648 +.long 1598071746,1598071746 +.long 1925389590,1925389590 +.long 203809468,203809468 +.long 2336832552,2336832552 +.long 1100287487,1100287487 +.long 1895934009,1895934009 +.long 3736275976,3736275976 +.long 2632234200,2632234200 +.long 2428589668,2428589668 +.long 1636092795,1636092795 +.long 1890988757,1890988757 +.long 1952214088,1952214088 +.long 1113045200,1113045200 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 
226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.size _x86_AES_decrypt,.-_x86_AES_decrypt +.globl AES_decrypt +.type AES_decrypt,@function +.align 16 +AES_decrypt: +.L_AES_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 28(%esp),%edi + movl %esp,%eax + subl $36,%esp + andl $-64,%esp + leal -127(%edi),%ebx + subl %esp,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esp + addl $4,%esp + movl %eax,28(%esp) + call .L010pic_point +.L010pic_point: + popl %ebp + leal OPENSSL_ia32cap_P-.L010pic_point(%ebp),%eax + leal .LAES_Td-.L010pic_point(%ebp),%ebp + leal 764(%esp),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + btl $25,(%eax) + jnc .L011x86 + movq (%esi),%mm0 + movq 8(%esi),%mm4 + call _sse_AES_decrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 16 +.L011x86: + movl %ebp,24(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + call _x86_AES_decrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_decrypt,.-.L_AES_decrypt_begin +.globl AES_cbc_encrypt +.type AES_cbc_encrypt,@function +.align 16 +AES_cbc_encrypt: +.L_AES_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 28(%esp),%ecx + cmpl $0,%ecx + je .L012drop_out + call .L013pic_point +.L013pic_point: + popl %ebp + leal OPENSSL_ia32cap_P-.L013pic_point(%ebp),%eax + cmpl $0,40(%esp) + leal .LAES_Te-.L013pic_point(%ebp),%ebp + jne .L014picked_te + leal .LAES_Td-.LAES_Te(%ebp),%ebp +.L014picked_te: + pushfl + cld + cmpl $512,%ecx + jb .L015slow_way + testl $15,%ecx + jnz .L015slow_way + btl $28,(%eax) + jc .L015slow_way + leal -324(%esp),%esi + 
andl $-64,%esi + movl %ebp,%eax + leal 2304(%ebp),%ebx + movl %esi,%edx + andl $4095,%eax + andl $4095,%ebx + andl $4095,%edx + cmpl %ebx,%edx + jb .L016tbl_break_out + subl %ebx,%edx + subl %edx,%esi + jmp .L017tbl_ok +.align 4 +.L016tbl_break_out: + subl %eax,%edx + andl $4095,%edx + addl $384,%edx + subl %edx,%esi +.align 4 +.L017tbl_ok: + leal 24(%esp),%edx + xchgl %esi,%esp + addl $4,%esp + movl %ebp,24(%esp) + movl %esi,28(%esp) + movl (%edx),%eax + movl 4(%edx),%ebx + movl 12(%edx),%edi + movl 16(%edx),%esi + movl 20(%edx),%edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edi,44(%esp) + movl %esi,48(%esp) + movl $0,316(%esp) + movl %edi,%ebx + movl $61,%ecx + subl %ebp,%ebx + movl %edi,%esi + andl $4095,%ebx + leal 76(%esp),%edi + cmpl $2304,%ebx + jb .L018do_copy + cmpl $3852,%ebx + jb .L019skip_copy +.align 4 +.L018do_copy: + movl %edi,44(%esp) +.long 2784229001 +.L019skip_copy: + movl $16,%edi +.align 4 +.L020prefetch_tbl: + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%esi + leal 128(%ebp),%ebp + subl $1,%edi + jnz .L020prefetch_tbl + subl $2048,%ebp + movl 32(%esp),%esi + movl 48(%esp),%edi + cmpl $0,%edx + je .L021fast_decrypt + movl (%edi),%eax + movl 4(%edi),%ebx +.align 16 +.L022fast_enc_loop: + movl 8(%edi),%ecx + movl 12(%edi),%edx + xorl (%esi),%eax + xorl 4(%esi),%ebx + xorl 8(%esi),%ecx + xorl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_encrypt + movl 32(%esp),%esi + movl 36(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + leal 16(%esi),%esi + movl 40(%esp),%ecx + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L022fast_enc_loop + movl 48(%esp),%esi + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + cmpl $0,316(%esp) + movl 44(%esp),%edi + je .L023skip_ezero + movl $60,%ecx + xorl %eax,%eax +.align 4 +.long 2884892297 +.L023skip_ezero: + movl 28(%esp),%esp + popfl +.L012drop_out: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L021fast_decrypt: + cmpl 36(%esp),%esi + je .L024fast_dec_in_place + movl %edi,52(%esp) +.align 4 +.align 16 +.L025fast_dec_loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_decrypt + movl 52(%esp),%edi + movl 40(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 36(%esp),%edi + movl 32(%esp),%esi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + movl %esi,52(%esp) + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edi + movl %edi,36(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L025fast_dec_loop + movl 52(%esp),%edi + movl 48(%esp),%esi + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + jmp .L026fast_dec_out +.align 16 +.L024fast_dec_in_place: +.L027fast_dec_in_place_loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + leal 60(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 44(%esp),%edi + call _x86_AES_decrypt + movl 48(%esp),%edi + movl 36(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + 
leal 16(%esi),%esi + movl %esi,36(%esp) + leal 60(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 32(%esp),%esi + movl 40(%esp),%ecx + leal 16(%esi),%esi + movl %esi,32(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L027fast_dec_in_place_loop +.align 4 +.L026fast_dec_out: + cmpl $0,316(%esp) + movl 44(%esp),%edi + je .L028skip_dzero + movl $60,%ecx + xorl %eax,%eax +.align 4 +.long 2884892297 +.L028skip_dzero: + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L015slow_way: + movl (%eax),%eax + movl 36(%esp),%edi + leal -80(%esp),%esi + andl $-64,%esi + leal -143(%edi),%ebx + subl %esi,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esi + leal 768(%esi),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + leal 24(%esp),%edx + xchgl %esi,%esp + addl $4,%esp + movl %ebp,24(%esp) + movl %esi,28(%esp) + movl %eax,52(%esp) + movl (%edx),%eax + movl 4(%edx),%ebx + movl 16(%edx),%esi + movl 20(%edx),%edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edi,44(%esp) + movl %esi,48(%esp) + movl %esi,%edi + movl %eax,%esi + cmpl $0,%edx + je .L029slow_decrypt + cmpl $16,%ecx + movl %ebx,%edx + jb .L030slow_enc_tail + btl $25,52(%esp) + jnc .L031slow_enc_x86 + movq (%edi),%mm0 + movq 8(%edi),%mm4 +.align 16 +.L032slow_enc_loop_sse: + pxor (%esi),%mm0 + pxor 8(%esi),%mm4 + movl 44(%esp),%edi + call _sse_AES_encrypt_compact + movl 32(%esp),%esi + movl 36(%esp),%edi + movl 40(%esp),%ecx + movq %mm0,(%edi) + movq %mm4,8(%edi) + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + cmpl $16,%ecx + movl %ecx,40(%esp) + jae .L032slow_enc_loop_sse + testl $15,%ecx + jnz .L030slow_enc_tail + movl 48(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L031slow_enc_x86: + movl (%edi),%eax + movl 4(%edi),%ebx +.align 4 +.L033slow_enc_loop_x86: + movl 8(%edi),%ecx + movl 12(%edi),%edx + xorl (%esi),%eax + xorl 4(%esi),%ebx + xorl 8(%esi),%ecx + xorl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_encrypt_compact + movl 32(%esp),%esi + movl 36(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + cmpl $16,%ecx + movl %ecx,40(%esp) + jae .L033slow_enc_loop_x86 + testl $15,%ecx + jnz .L030slow_enc_tail + movl 48(%esp),%esi + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L030slow_enc_tail: + emms + movl %edx,%edi + movl $16,%ebx + subl %ecx,%ebx + cmpl %esi,%edi + je .L034enc_in_place +.align 4 +.long 2767451785 + jmp .L035enc_skip_in_place +.L034enc_in_place: + leal (%edi,%ecx,1),%edi +.L035enc_skip_in_place: + movl %ebx,%ecx + xorl %eax,%eax +.align 4 +.long 2868115081 + movl 48(%esp),%edi + movl %edx,%esi + movl (%edi),%eax + movl 4(%edi),%ebx + movl $16,40(%esp) + jmp .L033slow_enc_loop_x86 +.align 16 +.L029slow_decrypt: + btl $25,52(%esp) + jnc .L036slow_dec_loop_x86 +.align 4 +.L037slow_dec_loop_sse: + movq (%esi),%mm0 + movq 8(%esi),%mm4 + movl 44(%esp),%edi + call _sse_AES_decrypt_compact + movl 
32(%esp),%esi + leal 60(%esp),%eax + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl 48(%esp),%edi + movq (%esi),%mm1 + movq 8(%esi),%mm5 + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movq %mm1,(%edi) + movq %mm5,8(%edi) + subl $16,%ecx + jc .L038slow_dec_partial_sse + movq %mm0,(%ebx) + movq %mm4,8(%ebx) + leal 16(%ebx),%ebx + movl %ebx,36(%esp) + leal 16(%esi),%esi + movl %esi,32(%esp) + movl %ecx,40(%esp) + jnz .L037slow_dec_loop_sse + emms + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L038slow_dec_partial_sse: + movq %mm0,(%eax) + movq %mm4,8(%eax) + emms + addl $16,%ecx + movl %ebx,%edi + movl %eax,%esi +.align 4 +.long 2767451785 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L036slow_dec_loop_x86: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + leal 60(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 44(%esp),%edi + call _x86_AES_decrypt_compact + movl 48(%esp),%edi + movl 40(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + subl $16,%esi + jc .L039slow_dec_partial_x86 + movl %esi,40(%esp) + movl 36(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + leal 16(%esi),%esi + movl %esi,36(%esp) + leal 60(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 32(%esp),%esi + leal 16(%esi),%esi + movl %esi,32(%esp) + jnz .L036slow_dec_loop_x86 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L039slow_dec_partial_x86: + leal 60(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + movl 32(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + movl 36(%esp),%edi + leal 60(%esp),%esi +.align 4 +.long 2767451785 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_cbc_encrypt,.-.L_AES_cbc_encrypt_begin +.type _x86_AES_set_encrypt_key,@function +.align 16 +_x86_AES_set_encrypt_key: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%esi + movl 32(%esp),%edi + testl $-1,%esi + jz .L040badpointer + testl $-1,%edi + jz .L040badpointer + call .L041pic_point +.L041pic_point: + popl %ebp + leal .LAES_Te-.L041pic_point(%ebp),%ebp + leal 2176(%ebp),%ebp + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx + movl 28(%esp),%ecx + cmpl $128,%ecx + je .L04210rounds + cmpl $192,%ecx + je .L04312rounds + cmpl $256,%ecx + je .L04414rounds + movl $-2,%eax + jmp .L045exit +.L04210rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + xorl %ecx,%ecx + jmp .L04610shortcut +.align 4 +.L04710loop: + movl (%edi),%eax + movl 12(%edi),%edx +.L04610shortcut: + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl 
-128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,16(%edi) + xorl 4(%edi),%eax + movl %eax,20(%edi) + xorl 8(%edi),%eax + movl %eax,24(%edi) + xorl 12(%edi),%eax + movl %eax,28(%edi) + incl %ecx + addl $16,%edi + cmpl $10,%ecx + jl .L04710loop + movl $10,80(%edi) + xorl %eax,%eax + jmp .L045exit +.L04312rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 16(%esi),%ecx + movl 20(%esi),%edx + movl %ecx,16(%edi) + movl %edx,20(%edi) + xorl %ecx,%ecx + jmp .L04812shortcut +.align 4 +.L04912loop: + movl (%edi),%eax + movl 20(%edi),%edx +.L04812shortcut: + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,24(%edi) + xorl 4(%edi),%eax + movl %eax,28(%edi) + xorl 8(%edi),%eax + movl %eax,32(%edi) + xorl 12(%edi),%eax + movl %eax,36(%edi) + cmpl $7,%ecx + je .L05012break + incl %ecx + xorl 16(%edi),%eax + movl %eax,40(%edi) + xorl 20(%edi),%eax + movl %eax,44(%edi) + addl $24,%edi + jmp .L04912loop +.L05012break: + movl $12,72(%edi) + xorl %eax,%eax + jmp .L045exit +.L04414rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,16(%edi) + movl %ebx,20(%edi) + movl %ecx,24(%edi) + movl %edx,28(%edi) + xorl %ecx,%ecx + jmp .L05114shortcut +.align 4 +.L05214loop: + movl 28(%edi),%edx +.L05114shortcut: + movl (%edi),%eax + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,32(%edi) + xorl 4(%edi),%eax + movl %eax,36(%edi) + xorl 8(%edi),%eax + movl %eax,40(%edi) + xorl 12(%edi),%eax + movl %eax,44(%edi) + cmpl $6,%ecx + je .L05314break + incl %ecx + movl %eax,%edx + movl 16(%edi),%eax + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + shll $8,%ebx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $16,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $24,%ebx + xorl %ebx,%eax + movl %eax,48(%edi) + xorl 20(%edi),%eax + movl %eax,52(%edi) + xorl 24(%edi),%eax + movl %eax,56(%edi) + xorl 28(%edi),%eax + movl %eax,60(%edi) + addl $32,%edi + jmp .L05214loop +.L05314break: + movl $14,48(%edi) + xorl %eax,%eax + jmp .L045exit +.L040badpointer: + movl $-1,%eax +.L045exit: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,@function +.align 16 +AES_set_encrypt_key: +.L_AES_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + call _x86_AES_set_encrypt_key + ret 
+.size AES_set_encrypt_key,.-.L_AES_set_encrypt_key_begin +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,@function +.align 16 +AES_set_decrypt_key: +.L_AES_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + call _x86_AES_set_encrypt_key + cmpl $0,%eax + je .L054proceed + ret +.L054proceed: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 28(%esp),%esi + movl 240(%esi),%ecx + leal (,%ecx,4),%ecx + leal (%esi,%ecx,4),%edi +.align 4 +.L055invert: + movl (%esi),%eax + movl 4(%esi),%ebx + movl (%edi),%ecx + movl 4(%edi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,(%esi) + movl %edx,4(%esi) + movl 8(%esi),%eax + movl 12(%esi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,8(%edi) + movl %ebx,12(%edi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + addl $16,%esi + subl $16,%edi + cmpl %edi,%esi + jne .L055invert + movl 28(%esp),%edi + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,28(%esp) + movl 16(%edi),%eax +.align 4 +.L056permute: + addl $16,%edi + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %esi,%ebx + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %eax,%ebx + xorl %esi,%ecx + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + xorl %eax,%ecx + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + roll $8,%eax + xorl %esi,%edx + movl 4(%edi),%ebp + xorl %ebx,%eax + xorl %edx,%ebx + xorl %ecx,%eax + roll $24,%ebx + xorl %edx,%ecx + xorl %edx,%eax + roll $16,%ecx + xorl %ebx,%eax + roll $8,%edx + xorl %ecx,%eax + movl %ebp,%ebx + xorl %edx,%eax + movl %eax,(%edi) + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %ebx,%ecx + xorl %esi,%edx + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + xorl %ebx,%edx + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + roll $8,%ebx + xorl %esi,%eax + movl 8(%edi),%ebp + xorl %ecx,%ebx + xorl %eax,%ecx + xorl %edx,%ebx + roll $24,%ecx + xorl %eax,%edx + xorl %eax,%ebx + roll $16,%edx + xorl %ecx,%ebx + roll $8,%eax + xorl %edx,%ebx + movl %ebp,%ecx + xorl %eax,%ebx + movl %ebx,4(%edi) + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %esi,%edx + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %ecx,%edx + xorl %esi,%eax + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + xorl %ecx,%eax + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + roll $8,%ecx + xorl %esi,%ebx + movl 12(%edi),%ebp + xorl %edx,%ecx + xorl %ebx,%edx + xorl %eax,%ecx + roll $24,%edx + xorl %ebx,%eax + xorl %ebx,%ecx + roll $16,%eax + xorl %edx,%ecx + roll $8,%ebx + xorl %eax,%ecx + movl %ebp,%edx + xorl %ebx,%ecx + movl %ecx,8(%edi) + movl 
$2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %esi,%eax + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %edx,%eax + xorl %esi,%ebx + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + xorl %edx,%ebx + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + roll $8,%edx + xorl %esi,%ecx + movl 16(%edi),%ebp + xorl %eax,%edx + xorl %ecx,%eax + xorl %ebx,%edx + roll $24,%eax + xorl %ecx,%ebx + xorl %ecx,%edx + roll $16,%ebx + xorl %eax,%edx + roll $8,%ecx + xorl %ebx,%edx + movl %ebp,%eax + xorl %ecx,%edx + movl %edx,12(%edi) + cmpl 28(%esp),%edi + jb .L056permute + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_set_decrypt_key,.-.L_AES_set_decrypt_key_begin +.byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89 +.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 +.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: +#else +.text +.type _x86_AES_encrypt_compact,@function +.align 16 +_x86_AES_encrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi +.align 16 +.L000loop: + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movzbl -128(%ebp,%esi,1),%esi + movzbl %ch,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ah,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $8,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movzbl -128(%ebp,%ecx,1),%ecx + shll $24,%ecx + xorl %ecx,%edx + movl %esi,%ecx + + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl 
$4278124286,%edi + subl %ebp,%esi + movl %ecx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %ecx,%edi + xorl %esi,%ecx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%ecx + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%ecx + andl %edx,%ebp + leal (%edx,%edx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %edx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %edx,%edi + xorl %esi,%edx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%edx + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%edx + andl %eax,%ebp + leal (%eax,%eax,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %eax,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %eax,%edi + xorl %esi,%eax + rorl $24,%edi + xorl %ebp,%esi + roll $24,%eax + xorl %edi,%esi + movl $2155905152,%ebp + xorl %esi,%eax + andl %ebx,%ebp + leal (%ebx,%ebx,1),%edi + movl %ebp,%esi + shrl $7,%ebp + andl $4278124286,%edi + subl %ebp,%esi + movl %ebx,%ebp + andl $454761243,%esi + rorl $16,%ebp + xorl %edi,%esi + movl %ebx,%edi + xorl %esi,%ebx + rorl $24,%edi + xorl %ebp,%esi + roll $24,%ebx + xorl %edi,%esi + xorl %esi,%ebx + movl 20(%esp),%edi + movl 28(%esp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L000loop + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movzbl -128(%ebp,%esi,1),%esi + movzbl %ch,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + + movl 20(%esp),%edi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ah,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $8,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movzbl -128(%ebp,%ecx,1),%ecx + shll $24,%ecx + xorl %ecx,%edx + movl %esi,%ecx + + xorl 16(%edi),%eax + xorl 20(%edi),%ebx + xorl 24(%edi),%ecx + xorl 28(%edi),%edx + ret +.size _x86_AES_encrypt_compact,.-_x86_AES_encrypt_compact +.type _sse_AES_encrypt_compact,@function +.align 16 +_sse_AES_encrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl $454761243,%eax + movl %eax,8(%esp) + movl %eax,12(%esp) + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 
64(%ebp),%ecx + movl 96(%ebp),%edx +.align 16 +.L001loop: + pshufw $8,%mm0,%mm1 + pshufw $13,%mm4,%mm5 + movd %mm1,%eax + movd %mm5,%ebx + movl %edi,20(%esp) + movzbl %al,%esi + movzbl %ah,%edx + pshufw $13,%mm0,%mm2 + movzbl -128(%ebp,%esi,1),%ecx + movzbl %bl,%edi + movzbl -128(%ebp,%edx,1),%edx + shrl $16,%eax + shll $8,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $16,%esi + pshufw $8,%mm4,%mm6 + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $24,%esi + shrl $16,%ebx + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $8,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + movd %mm2,%eax + movd %ecx,%mm0 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %ah,%edi + shll $16,%ecx + movd %mm6,%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + shll $8,%esi + shrl $16,%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shrl $16,%eax + movd %ecx,%mm1 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %ah,%edi + shll $16,%ecx + andl $255,%eax + orl %esi,%ecx + punpckldq %mm1,%mm0 + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + andl $255,%ebx + movzbl -128(%ebp,%eax,1),%eax + orl %esi,%ecx + shll $16,%eax + movzbl -128(%ebp,%edi,1),%esi + orl %eax,%edx + shll $8,%esi + movzbl -128(%ebp,%ebx,1),%ebx + orl %esi,%ecx + orl %ebx,%edx + movl 20(%esp),%edi + movd %ecx,%mm4 + movd %edx,%mm5 + punpckldq %mm5,%mm4 + addl $16,%edi + cmpl 24(%esp),%edi + ja .L002out + movq 8(%esp),%mm2 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + movq %mm0,%mm1 + movq %mm4,%mm5 + pcmpgtb %mm0,%mm3 + pcmpgtb %mm4,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + pshufw $177,%mm0,%mm2 + pshufw $177,%mm4,%mm6 + paddb %mm0,%mm0 + paddb %mm4,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pshufw $177,%mm2,%mm3 + pshufw $177,%mm6,%mm7 + pxor %mm0,%mm1 + pxor %mm4,%mm5 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq %mm3,%mm2 + movq %mm7,%mm6 + pslld $8,%mm3 + pslld $8,%mm7 + psrld $24,%mm2 + psrld $24,%mm6 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq %mm1,%mm3 + movq %mm5,%mm7 + movq (%edi),%mm2 + movq 8(%edi),%mm6 + psrld $8,%mm1 + psrld $8,%mm5 + movl -128(%ebp),%eax + pslld $24,%mm3 + pslld $24,%mm7 + movl -64(%ebp),%ebx + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movl (%ebp),%ecx + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movl 64(%ebp),%edx + pxor %mm2,%mm0 + pxor %mm6,%mm4 + jmp .L001loop +.align 16 +.L002out: + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + ret +.size _sse_AES_encrypt_compact,.-_sse_AES_encrypt_compact +.type _x86_AES_encrypt,@function +.align 16 +_x86_AES_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) +.align 16 +.L003loop: + movl %eax,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %bh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movl (%ebp,%esi,8),%esi + movzbl %ch,%edi + xorl 3(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + 
movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movl (%ebp,%esi,8),%esi + movzbl %dh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movzbl %bh,%edi + xorl 1(%ebp,%edi,8),%esi + + movl 20(%esp),%edi + movl (%ebp,%edx,8),%edx + movzbl %ah,%eax + xorl 3(%ebp,%eax,8),%edx + movl 4(%esp),%eax + andl $255,%ebx + xorl 2(%ebp,%ebx,8),%edx + movl 8(%esp),%ebx + xorl 1(%ebp,%ecx,8),%edx + movl %esi,%ecx + + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L003loop + movl %eax,%esi + andl $255,%esi + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %bh,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + shrl $16,%ebx + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %ch,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $24,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + shrl $24,%ecx + movl 2(%ebp,%esi,8),%esi + andl $255,%esi + movzbl %dh,%edi + movl (%ebp,%edi,8),%edi + andl $65280,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edx + andl $255,%edi + movl (%ebp,%edi,8),%edi + andl $16711680,%edi + xorl %edi,%esi + movzbl %bh,%edi + movl 2(%ebp,%edi,8),%edi + andl $4278190080,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movl 2(%ebp,%edx,8),%edx + andl $255,%edx + movzbl %ah,%eax + movl (%ebp,%eax,8),%eax + andl $65280,%eax + xorl %eax,%edx + movl 4(%esp),%eax + andl $255,%ebx + movl (%ebp,%ebx,8),%ebx + andl $16711680,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + movl 2(%ebp,%ecx,8),%ecx + andl $4278190080,%ecx + xorl %ecx,%edx + movl %esi,%ecx + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + ret +.align 64 +.LAES_Te: +.long 2774754246,2774754246 +.long 2222750968,2222750968 +.long 2574743534,2574743534 +.long 2373680118,2373680118 +.long 234025727,234025727 +.long 3177933782,3177933782 +.long 2976870366,2976870366 +.long 1422247313,1422247313 +.long 1345335392,1345335392 +.long 50397442,50397442 +.long 2842126286,2842126286 +.long 2099981142,2099981142 +.long 436141799,436141799 +.long 1658312629,1658312629 +.long 3870010189,3870010189 +.long 2591454956,2591454956 +.long 1170918031,1170918031 +.long 2642575903,2642575903 +.long 1086966153,1086966153 +.long 2273148410,2273148410 +.long 368769775,368769775 +.long 3948501426,3948501426 +.long 3376891790,3376891790 +.long 200339707,200339707 +.long 3970805057,3970805057 +.long 1742001331,1742001331 +.long 4255294047,4255294047 +.long 3937382213,3937382213 +.long 3214711843,3214711843 +.long 4154762323,4154762323 +.long 2524082916,2524082916 +.long 1539358875,1539358875 +.long 3266819957,3266819957 +.long 486407649,486407649 +.long 2928907069,2928907069 +.long 1780885068,1780885068 +.long 1513502316,1513502316 +.long 1094664062,1094664062 +.long 49805301,49805301 +.long 1338821763,1338821763 +.long 1546925160,1546925160 +.long 4104496465,4104496465 +.long 
887481809,887481809 +.long 150073849,150073849 +.long 2473685474,2473685474 +.long 1943591083,1943591083 +.long 1395732834,1395732834 +.long 1058346282,1058346282 +.long 201589768,201589768 +.long 1388824469,1388824469 +.long 1696801606,1696801606 +.long 1589887901,1589887901 +.long 672667696,672667696 +.long 2711000631,2711000631 +.long 251987210,251987210 +.long 3046808111,3046808111 +.long 151455502,151455502 +.long 907153956,907153956 +.long 2608889883,2608889883 +.long 1038279391,1038279391 +.long 652995533,652995533 +.long 1764173646,1764173646 +.long 3451040383,3451040383 +.long 2675275242,2675275242 +.long 453576978,453576978 +.long 2659418909,2659418909 +.long 1949051992,1949051992 +.long 773462580,773462580 +.long 756751158,756751158 +.long 2993581788,2993581788 +.long 3998898868,3998898868 +.long 4221608027,4221608027 +.long 4132590244,4132590244 +.long 1295727478,1295727478 +.long 1641469623,1641469623 +.long 3467883389,3467883389 +.long 2066295122,2066295122 +.long 1055122397,1055122397 +.long 1898917726,1898917726 +.long 2542044179,2542044179 +.long 4115878822,4115878822 +.long 1758581177,1758581177 +.long 0,0 +.long 753790401,753790401 +.long 1612718144,1612718144 +.long 536673507,536673507 +.long 3367088505,3367088505 +.long 3982187446,3982187446 +.long 3194645204,3194645204 +.long 1187761037,1187761037 +.long 3653156455,3653156455 +.long 1262041458,1262041458 +.long 3729410708,3729410708 +.long 3561770136,3561770136 +.long 3898103984,3898103984 +.long 1255133061,1255133061 +.long 1808847035,1808847035 +.long 720367557,720367557 +.long 3853167183,3853167183 +.long 385612781,385612781 +.long 3309519750,3309519750 +.long 3612167578,3612167578 +.long 1429418854,1429418854 +.long 2491778321,2491778321 +.long 3477423498,3477423498 +.long 284817897,284817897 +.long 100794884,100794884 +.long 2172616702,2172616702 +.long 4031795360,4031795360 +.long 1144798328,1144798328 +.long 3131023141,3131023141 +.long 3819481163,3819481163 +.long 4082192802,4082192802 +.long 4272137053,4272137053 +.long 3225436288,3225436288 +.long 2324664069,2324664069 +.long 2912064063,2912064063 +.long 3164445985,3164445985 +.long 1211644016,1211644016 +.long 83228145,83228145 +.long 3753688163,3753688163 +.long 3249976951,3249976951 +.long 1977277103,1977277103 +.long 1663115586,1663115586 +.long 806359072,806359072 +.long 452984805,452984805 +.long 250868733,250868733 +.long 1842533055,1842533055 +.long 1288555905,1288555905 +.long 336333848,336333848 +.long 890442534,890442534 +.long 804056259,804056259 +.long 3781124030,3781124030 +.long 2727843637,2727843637 +.long 3427026056,3427026056 +.long 957814574,957814574 +.long 1472513171,1472513171 +.long 4071073621,4071073621 +.long 2189328124,2189328124 +.long 1195195770,1195195770 +.long 2892260552,2892260552 +.long 3881655738,3881655738 +.long 723065138,723065138 +.long 2507371494,2507371494 +.long 2690670784,2690670784 +.long 2558624025,2558624025 +.long 3511635870,3511635870 +.long 2145180835,2145180835 +.long 1713513028,1713513028 +.long 2116692564,2116692564 +.long 2878378043,2878378043 +.long 2206763019,2206763019 +.long 3393603212,3393603212 +.long 703524551,703524551 +.long 3552098411,3552098411 +.long 1007948840,1007948840 +.long 2044649127,2044649127 +.long 3797835452,3797835452 +.long 487262998,487262998 +.long 1994120109,1994120109 +.long 1004593371,1004593371 +.long 1446130276,1446130276 +.long 1312438900,1312438900 +.long 503974420,503974420 +.long 3679013266,3679013266 +.long 168166924,168166924 +.long 1814307912,1814307912 +.long 
3831258296,3831258296 +.long 1573044895,1573044895 +.long 1859376061,1859376061 +.long 4021070915,4021070915 +.long 2791465668,2791465668 +.long 2828112185,2828112185 +.long 2761266481,2761266481 +.long 937747667,937747667 +.long 2339994098,2339994098 +.long 854058965,854058965 +.long 1137232011,1137232011 +.long 1496790894,1496790894 +.long 3077402074,3077402074 +.long 2358086913,2358086913 +.long 1691735473,1691735473 +.long 3528347292,3528347292 +.long 3769215305,3769215305 +.long 3027004632,3027004632 +.long 4199962284,4199962284 +.long 133494003,133494003 +.long 636152527,636152527 +.long 2942657994,2942657994 +.long 2390391540,2390391540 +.long 3920539207,3920539207 +.long 403179536,403179536 +.long 3585784431,3585784431 +.long 2289596656,2289596656 +.long 1864705354,1864705354 +.long 1915629148,1915629148 +.long 605822008,605822008 +.long 4054230615,4054230615 +.long 3350508659,3350508659 +.long 1371981463,1371981463 +.long 602466507,602466507 +.long 2094914977,2094914977 +.long 2624877800,2624877800 +.long 555687742,555687742 +.long 3712699286,3712699286 +.long 3703422305,3703422305 +.long 2257292045,2257292045 +.long 2240449039,2240449039 +.long 2423288032,2423288032 +.long 1111375484,1111375484 +.long 3300242801,3300242801 +.long 2858837708,2858837708 +.long 3628615824,3628615824 +.long 84083462,84083462 +.long 32962295,32962295 +.long 302911004,302911004 +.long 2741068226,2741068226 +.long 1597322602,1597322602 +.long 4183250862,4183250862 +.long 3501832553,3501832553 +.long 2441512471,2441512471 +.long 1489093017,1489093017 +.long 656219450,656219450 +.long 3114180135,3114180135 +.long 954327513,954327513 +.long 335083755,335083755 +.long 3013122091,3013122091 +.long 856756514,856756514 +.long 3144247762,3144247762 +.long 1893325225,1893325225 +.long 2307821063,2307821063 +.long 2811532339,2811532339 +.long 3063651117,3063651117 +.long 572399164,572399164 +.long 2458355477,2458355477 +.long 552200649,552200649 +.long 1238290055,1238290055 +.long 4283782570,4283782570 +.long 2015897680,2015897680 +.long 2061492133,2061492133 +.long 2408352771,2408352771 +.long 4171342169,4171342169 +.long 2156497161,2156497161 +.long 386731290,386731290 +.long 3669999461,3669999461 +.long 837215959,837215959 +.long 3326231172,3326231172 +.long 3093850320,3093850320 +.long 3275833730,3275833730 +.long 2962856233,2962856233 +.long 1999449434,1999449434 +.long 286199582,286199582 +.long 3417354363,3417354363 +.long 4233385128,4233385128 +.long 3602627437,3602627437 +.long 974525996,974525996 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 
225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 +.byte 99,124,119,123,242,107,111,197 +.byte 48,1,103,43,254,215,171,118 +.byte 202,130,201,125,250,89,71,240 +.byte 173,212,162,175,156,164,114,192 +.byte 183,253,147,38,54,63,247,204 +.byte 52,165,229,241,113,216,49,21 +.byte 4,199,35,195,24,150,5,154 +.byte 7,18,128,226,235,39,178,117 +.byte 9,131,44,26,27,110,90,160 +.byte 82,59,214,179,41,227,47,132 +.byte 83,209,0,237,32,252,177,91 +.byte 106,203,190,57,74,76,88,207 +.byte 208,239,170,251,67,77,51,133 +.byte 69,249,2,127,80,60,159,168 +.byte 81,163,64,143,146,157,56,245 +.byte 188,182,218,33,16,255,243,210 +.byte 205,12,19,236,95,151,68,23 +.byte 196,167,126,61,100,93,25,115 +.byte 96,129,79,220,34,42,144,136 +.byte 70,238,184,20,222,94,11,219 +.byte 224,50,58,10,73,6,36,92 +.byte 194,211,172,98,145,149,228,121 +.byte 231,200,55,109,141,213,78,169 +.byte 108,86,244,234,101,122,174,8 +.byte 186,120,37,46,28,166,180,198 +.byte 232,221,116,31,75,189,139,138 +.byte 112,62,181,102,72,3,246,14 +.byte 97,53,87,185,134,193,29,158 +.byte 225,248,152,17,105,217,142,148 +.byte 155,30,135,233,206,85,40,223 +.byte 140,161,137,13,191,230,66,104 +.byte 65,153,45,15,176,84,187,22 
+.long 1,2,4,8 +.long 16,32,64,128 +.long 27,54,0,0 +.long 0,0,0,0 +.size _x86_AES_encrypt,.-_x86_AES_encrypt +.globl AES_encrypt +.type AES_encrypt,@function +.align 16 +AES_encrypt: +.L_AES_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 28(%esp),%edi + movl %esp,%eax + subl $36,%esp + andl $-64,%esp + leal -127(%edi),%ebx + subl %esp,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esp + addl $4,%esp + movl %eax,28(%esp) + call .L004pic_point +.L004pic_point: + popl %ebp + leal OPENSSL_ia32cap_P,%eax + leal .LAES_Te-.L004pic_point(%ebp),%ebp + leal 764(%esp),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + btl $25,(%eax) + jnc .L005x86 + movq (%esi),%mm0 + movq 8(%esi),%mm4 + call _sse_AES_encrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 16 +.L005x86: + movl %ebp,24(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + call _x86_AES_encrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_encrypt,.-.L_AES_encrypt_begin +.type _x86_AES_decrypt_compact,@function +.align 16 +_x86_AES_decrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl -128(%ebp),%edi + movl -96(%ebp),%esi + movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi +.align 16 +.L006loop: + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl -128(%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + shrl $24,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%eax + subl %edi,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %esi,%eax + movl $2155905152,%edi + andl %eax,%edi + movl 
%edi,%esi + shrl $7,%edi + leal (%eax,%eax,1),%ebx + subl %edi,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %ecx,%eax + xorl %esi,%ebx + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %ecx,%ebx + roll $8,%ecx + xorl %esi,%ebp + xorl %eax,%ecx + xorl %ebp,%eax + xorl %ebx,%ecx + xorl %ebp,%ebx + roll $24,%eax + xorl %ebp,%ecx + roll $16,%ebx + xorl %eax,%ecx + roll $8,%ebp + xorl %ebx,%ecx + movl 4(%esp),%eax + xorl %ebp,%ecx + movl %ecx,12(%esp) + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebx + subl %edi,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %esi,%ebx + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %edx,%ebx + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %edx,%ecx + roll $8,%edx + xorl %esi,%ebp + xorl %ebx,%edx + xorl %ebp,%ebx + xorl %ecx,%edx + xorl %ebp,%ecx + roll $24,%ebx + xorl %ebp,%edx + roll $16,%ecx + xorl %ebx,%edx + roll $8,%ebp + xorl %ecx,%edx + movl 8(%esp),%ebx + xorl %ebp,%edx + movl %edx,16(%esp) + movl $2155905152,%edi + andl %eax,%edi + movl %edi,%esi + shrl $7,%edi + leal (%eax,%eax,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%edx + subl %edi,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %eax,%ecx + xorl %esi,%edx + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %eax,%edx + roll $8,%eax + xorl %esi,%ebp + xorl %ecx,%eax + xorl %ebp,%ecx + xorl %edx,%eax + xorl %ebp,%edx + roll $24,%ecx + xorl %ebp,%eax + roll $16,%edx + xorl %ecx,%eax + roll $8,%ebp + xorl %edx,%eax + xorl %ebp,%eax + movl $2155905152,%edi + andl %ebx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ebx,%ebx,1),%ecx + subl %edi,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%edi + andl %ecx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%ecx,%ecx,1),%edx + subl %edi,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %ebx,%ecx + xorl %esi,%edx + movl $2155905152,%edi + andl %edx,%edi + movl %edi,%esi + shrl $7,%edi + leal (%edx,%edx,1),%ebp + subl %edi,%esi + andl $4278124286,%ebp + andl $454761243,%esi + xorl %ebx,%edx + roll $8,%ebx + xorl %esi,%ebp + xorl %ecx,%ebx + xorl %ebp,%ecx + xorl %edx,%ebx + xorl %ebp,%edx + roll $24,%ecx + xorl %ebp,%ebx + roll $16,%edx + xorl %ecx,%ebx + roll $8,%ebp + xorl %edx,%ebx + movl 12(%esp),%ecx + xorl %ebp,%ebx + movl 16(%esp),%edx + movl 20(%esp),%edi + movl 28(%esp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L006loop + movl %eax,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 
%esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl -128(%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl -128(%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movzbl -128(%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl -128(%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl -128(%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + shrl $24,%eax + movzbl -128(%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl 4(%esp),%eax + xorl 16(%edi),%eax + xorl 20(%edi),%ebx + xorl 24(%edi),%ecx + xorl 28(%edi),%edx + ret +.size _x86_AES_decrypt_compact,.-_x86_AES_decrypt_compact +.type _sse_AES_decrypt_compact,@function +.align 16 +_sse_AES_decrypt_compact: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) + movl $454761243,%eax + movl %eax,8(%esp) + movl %eax,12(%esp) + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx +.align 16 +.L007loop: + pshufw $12,%mm0,%mm1 + pshufw $9,%mm4,%mm5 + movd %mm1,%eax + movd %mm5,%ebx + movl %edi,20(%esp) + movzbl %al,%esi + movzbl %ah,%edx + pshufw $6,%mm0,%mm2 + movzbl -128(%ebp,%esi,1),%ecx + movzbl %bl,%edi + movzbl -128(%ebp,%edx,1),%edx + shrl $16,%eax + shll $8,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $16,%esi + pshufw $3,%mm4,%mm6 + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $24,%esi + shrl $16,%ebx + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shll $24,%esi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $8,%esi + movd %mm2,%eax + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + shll $16,%esi + movd %mm6,%ebx + movd %ecx,%mm0 + movzbl -128(%ebp,%edi,1),%ecx + movzbl %al,%edi + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bl,%edi + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %ah,%edi + shll $16,%esi + shrl $16,%eax + orl %esi,%edx + movzbl -128(%ebp,%edi,1),%esi + movzbl %bh,%edi + shrl $16,%ebx + shll $8,%esi + movd %edx,%mm1 + movzbl -128(%ebp,%edi,1),%edx + movzbl %bh,%edi + shll $24,%edx + andl $255,%ebx + orl %esi,%edx + punpckldq %mm1,%mm0 + movzbl -128(%ebp,%edi,1),%esi + movzbl %al,%edi + shll $8,%esi + movzbl %ah,%eax + movzbl -128(%ebp,%ebx,1),%ebx + orl %esi,%ecx + movzbl -128(%ebp,%edi,1),%esi + orl %ebx,%edx + shll $16,%esi + movzbl -128(%ebp,%eax,1),%eax + orl %esi,%edx + shll $24,%eax + orl %eax,%ecx + movl 20(%esp),%edi + movd %edx,%mm4 + movd %ecx,%mm5 + punpckldq %mm5,%mm4 + addl $16,%edi + cmpl 24(%esp),%edi + ja .L008out + movq %mm0,%mm3 + movq %mm4,%mm7 + pshufw $228,%mm0,%mm2 + pshufw $228,%mm4,%mm6 + movq %mm0,%mm1 + movq %mm4,%mm5 + pshufw 
$177,%mm0,%mm0 + pshufw $177,%mm4,%mm4 + pslld $8,%mm2 + pslld $8,%mm6 + psrld $8,%mm3 + psrld $8,%mm7 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pslld $16,%mm2 + pslld $16,%mm6 + psrld $16,%mm3 + psrld $16,%mm7 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movq 8(%esp),%mm3 + pxor %mm2,%mm2 + pxor %mm6,%mm6 + pcmpgtb %mm1,%mm2 + pcmpgtb %mm5,%mm6 + pand %mm3,%mm2 + pand %mm3,%mm6 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm2,%mm1 + pxor %mm6,%mm5 + movq %mm1,%mm3 + movq %mm5,%mm7 + movq %mm1,%mm2 + movq %mm5,%mm6 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pslld $24,%mm3 + pslld $24,%mm7 + psrld $8,%mm2 + psrld $8,%mm6 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + movq 8(%esp),%mm2 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + pcmpgtb %mm1,%mm3 + pcmpgtb %mm5,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm3,%mm1 + pxor %mm7,%mm5 + pshufw $177,%mm1,%mm3 + pshufw $177,%mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm3,%mm3 + pxor %mm7,%mm7 + pcmpgtb %mm1,%mm3 + pcmpgtb %mm5,%mm7 + pand %mm2,%mm3 + pand %mm2,%mm7 + paddb %mm1,%mm1 + paddb %mm5,%mm5 + pxor %mm3,%mm1 + pxor %mm7,%mm5 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movq %mm1,%mm3 + movq %mm5,%mm7 + pshufw $177,%mm1,%mm2 + pshufw $177,%mm5,%mm6 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + pslld $8,%mm1 + pslld $8,%mm5 + psrld $8,%mm3 + psrld $8,%mm7 + movq (%edi),%mm2 + movq 8(%edi),%mm6 + pxor %mm1,%mm0 + pxor %mm5,%mm4 + pxor %mm3,%mm0 + pxor %mm7,%mm4 + movl -128(%ebp),%eax + pslld $16,%mm1 + pslld $16,%mm5 + movl -64(%ebp),%ebx + psrld $16,%mm3 + psrld $16,%mm7 + movl (%ebp),%ecx + pxor %mm1,%mm0 + pxor %mm5,%mm4 + movl 64(%ebp),%edx + pxor %mm3,%mm0 + pxor %mm7,%mm4 + pxor %mm2,%mm0 + pxor %mm6,%mm4 + jmp .L007loop +.align 16 +.L008out: + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + ret +.size _sse_AES_decrypt_compact,.-_sse_AES_decrypt_compact +.type _x86_AES_decrypt,@function +.align 16 +_x86_AES_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + movl %edi,20(%esp) + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,24(%esp) +.align 16 +.L009loop: + movl %eax,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %dh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %ebx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,4(%esp) + + movl %ebx,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %ah,%edi + xorl 3(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %ecx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + movl %esi,8(%esp) + + movl %ecx,%esi + andl $255,%esi + movl (%ebp,%esi,8),%esi + movzbl %bh,%edi + xorl 3(%ebp,%edi,8),%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + xorl 2(%ebp,%edi,8),%esi + movl %edx,%edi + shrl $24,%edi + xorl 1(%ebp,%edi,8),%esi + + movl 20(%esp),%edi + andl $255,%edx + movl (%ebp,%edx,8),%edx + movzbl %ch,%ecx + xorl 3(%ebp,%ecx,8),%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + xorl 2(%ebp,%ebx,8),%edx + movl 8(%esp),%ebx + shrl $24,%eax + xorl 1(%ebp,%eax,8),%edx + movl 4(%esp),%eax + + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + cmpl 24(%esp),%edi + movl %edi,20(%esp) + jb .L009loop + leal 2176(%ebp),%ebp + movl -128(%ebp),%edi + movl -96(%ebp),%esi + 
movl -64(%ebp),%edi + movl -32(%ebp),%esi + movl (%ebp),%edi + movl 32(%ebp),%esi + movl 64(%ebp),%edi + movl 96(%ebp),%esi + leal -128(%ebp),%ebp + movl %eax,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %dh,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ebx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,4(%esp) + movl %ebx,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %ah,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %ecx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl %esi,8(%esp) + movl %ecx,%esi + andl $255,%esi + movzbl (%ebp,%esi,1),%esi + movzbl %bh,%edi + movzbl (%ebp,%edi,1),%edi + shll $8,%edi + xorl %edi,%esi + movl %eax,%edi + shrl $16,%edi + andl $255,%edi + movzbl (%ebp,%edi,1),%edi + shll $16,%edi + xorl %edi,%esi + movl %edx,%edi + shrl $24,%edi + movzbl (%ebp,%edi,1),%edi + shll $24,%edi + xorl %edi,%esi + movl 20(%esp),%edi + andl $255,%edx + movzbl (%ebp,%edx,1),%edx + movzbl %ch,%ecx + movzbl (%ebp,%ecx,1),%ecx + shll $8,%ecx + xorl %ecx,%edx + movl %esi,%ecx + shrl $16,%ebx + andl $255,%ebx + movzbl (%ebp,%ebx,1),%ebx + shll $16,%ebx + xorl %ebx,%edx + movl 8(%esp),%ebx + shrl $24,%eax + movzbl (%ebp,%eax,1),%eax + shll $24,%eax + xorl %eax,%edx + movl 4(%esp),%eax + leal -2048(%ebp),%ebp + addl $16,%edi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + ret +.align 64 +.LAES_Td: +.long 1353184337,1353184337 +.long 1399144830,1399144830 +.long 3282310938,3282310938 +.long 2522752826,2522752826 +.long 3412831035,3412831035 +.long 4047871263,4047871263 +.long 2874735276,2874735276 +.long 2466505547,2466505547 +.long 1442459680,1442459680 +.long 4134368941,4134368941 +.long 2440481928,2440481928 +.long 625738485,625738485 +.long 4242007375,4242007375 +.long 3620416197,3620416197 +.long 2151953702,2151953702 +.long 2409849525,2409849525 +.long 1230680542,1230680542 +.long 1729870373,1729870373 +.long 2551114309,2551114309 +.long 3787521629,3787521629 +.long 41234371,41234371 +.long 317738113,317738113 +.long 2744600205,2744600205 +.long 3338261355,3338261355 +.long 3881799427,3881799427 +.long 2510066197,2510066197 +.long 3950669247,3950669247 +.long 3663286933,3663286933 +.long 763608788,763608788 +.long 3542185048,3542185048 +.long 694804553,694804553 +.long 1154009486,1154009486 +.long 1787413109,1787413109 +.long 2021232372,2021232372 +.long 1799248025,1799248025 +.long 3715217703,3715217703 +.long 3058688446,3058688446 +.long 397248752,397248752 +.long 1722556617,1722556617 +.long 3023752829,3023752829 +.long 407560035,407560035 +.long 2184256229,2184256229 +.long 1613975959,1613975959 +.long 1165972322,1165972322 +.long 3765920945,3765920945 +.long 2226023355,2226023355 +.long 480281086,480281086 +.long 2485848313,2485848313 +.long 1483229296,1483229296 +.long 436028815,436028815 +.long 2272059028,2272059028 +.long 3086515026,3086515026 +.long 601060267,601060267 +.long 3791801202,3791801202 +.long 1468997603,1468997603 +.long 715871590,715871590 +.long 120122290,120122290 +.long 63092015,63092015 +.long 2591802758,2591802758 +.long 2768779219,2768779219 +.long 4068943920,4068943920 +.long 2997206819,2997206819 +.long 3127509762,3127509762 +.long 
1552029421,1552029421 +.long 723308426,723308426 +.long 2461301159,2461301159 +.long 4042393587,4042393587 +.long 2715969870,2715969870 +.long 3455375973,3455375973 +.long 3586000134,3586000134 +.long 526529745,526529745 +.long 2331944644,2331944644 +.long 2639474228,2639474228 +.long 2689987490,2689987490 +.long 853641733,853641733 +.long 1978398372,1978398372 +.long 971801355,971801355 +.long 2867814464,2867814464 +.long 111112542,111112542 +.long 1360031421,1360031421 +.long 4186579262,4186579262 +.long 1023860118,1023860118 +.long 2919579357,2919579357 +.long 1186850381,1186850381 +.long 3045938321,3045938321 +.long 90031217,90031217 +.long 1876166148,1876166148 +.long 4279586912,4279586912 +.long 620468249,620468249 +.long 2548678102,2548678102 +.long 3426959497,3426959497 +.long 2006899047,2006899047 +.long 3175278768,3175278768 +.long 2290845959,2290845959 +.long 945494503,945494503 +.long 3689859193,3689859193 +.long 1191869601,1191869601 +.long 3910091388,3910091388 +.long 3374220536,3374220536 +.long 0,0 +.long 2206629897,2206629897 +.long 1223502642,1223502642 +.long 2893025566,2893025566 +.long 1316117100,1316117100 +.long 4227796733,4227796733 +.long 1446544655,1446544655 +.long 517320253,517320253 +.long 658058550,658058550 +.long 1691946762,1691946762 +.long 564550760,564550760 +.long 3511966619,3511966619 +.long 976107044,976107044 +.long 2976320012,2976320012 +.long 266819475,266819475 +.long 3533106868,3533106868 +.long 2660342555,2660342555 +.long 1338359936,1338359936 +.long 2720062561,2720062561 +.long 1766553434,1766553434 +.long 370807324,370807324 +.long 179999714,179999714 +.long 3844776128,3844776128 +.long 1138762300,1138762300 +.long 488053522,488053522 +.long 185403662,185403662 +.long 2915535858,2915535858 +.long 3114841645,3114841645 +.long 3366526484,3366526484 +.long 2233069911,2233069911 +.long 1275557295,1275557295 +.long 3151862254,3151862254 +.long 4250959779,4250959779 +.long 2670068215,2670068215 +.long 3170202204,3170202204 +.long 3309004356,3309004356 +.long 880737115,880737115 +.long 1982415755,1982415755 +.long 3703972811,3703972811 +.long 1761406390,1761406390 +.long 1676797112,1676797112 +.long 3403428311,3403428311 +.long 277177154,277177154 +.long 1076008723,1076008723 +.long 538035844,538035844 +.long 2099530373,2099530373 +.long 4164795346,4164795346 +.long 288553390,288553390 +.long 1839278535,1839278535 +.long 1261411869,1261411869 +.long 4080055004,4080055004 +.long 3964831245,3964831245 +.long 3504587127,3504587127 +.long 1813426987,1813426987 +.long 2579067049,2579067049 +.long 4199060497,4199060497 +.long 577038663,577038663 +.long 3297574056,3297574056 +.long 440397984,440397984 +.long 3626794326,3626794326 +.long 4019204898,4019204898 +.long 3343796615,3343796615 +.long 3251714265,3251714265 +.long 4272081548,4272081548 +.long 906744984,906744984 +.long 3481400742,3481400742 +.long 685669029,685669029 +.long 646887386,646887386 +.long 2764025151,2764025151 +.long 3835509292,3835509292 +.long 227702864,227702864 +.long 2613862250,2613862250 +.long 1648787028,1648787028 +.long 3256061430,3256061430 +.long 3904428176,3904428176 +.long 1593260334,1593260334 +.long 4121936770,4121936770 +.long 3196083615,3196083615 +.long 2090061929,2090061929 +.long 2838353263,2838353263 +.long 3004310991,3004310991 +.long 999926984,999926984 +.long 2809993232,2809993232 +.long 1852021992,1852021992 +.long 2075868123,2075868123 +.long 158869197,158869197 +.long 4095236462,4095236462 +.long 28809964,28809964 +.long 2828685187,2828685187 +.long 
1701746150,1701746150 +.long 2129067946,2129067946 +.long 147831841,147831841 +.long 3873969647,3873969647 +.long 3650873274,3650873274 +.long 3459673930,3459673930 +.long 3557400554,3557400554 +.long 3598495785,3598495785 +.long 2947720241,2947720241 +.long 824393514,824393514 +.long 815048134,815048134 +.long 3227951669,3227951669 +.long 935087732,935087732 +.long 2798289660,2798289660 +.long 2966458592,2966458592 +.long 366520115,366520115 +.long 1251476721,1251476721 +.long 4158319681,4158319681 +.long 240176511,240176511 +.long 804688151,804688151 +.long 2379631990,2379631990 +.long 1303441219,1303441219 +.long 1414376140,1414376140 +.long 3741619940,3741619940 +.long 3820343710,3820343710 +.long 461924940,461924940 +.long 3089050817,3089050817 +.long 2136040774,2136040774 +.long 82468509,82468509 +.long 1563790337,1563790337 +.long 1937016826,1937016826 +.long 776014843,776014843 +.long 1511876531,1511876531 +.long 1389550482,1389550482 +.long 861278441,861278441 +.long 323475053,323475053 +.long 2355222426,2355222426 +.long 2047648055,2047648055 +.long 2383738969,2383738969 +.long 2302415851,2302415851 +.long 3995576782,3995576782 +.long 902390199,902390199 +.long 3991215329,3991215329 +.long 1018251130,1018251130 +.long 1507840668,1507840668 +.long 1064563285,1064563285 +.long 2043548696,2043548696 +.long 3208103795,3208103795 +.long 3939366739,3939366739 +.long 1537932639,1537932639 +.long 342834655,342834655 +.long 2262516856,2262516856 +.long 2180231114,2180231114 +.long 1053059257,1053059257 +.long 741614648,741614648 +.long 1598071746,1598071746 +.long 1925389590,1925389590 +.long 203809468,203809468 +.long 2336832552,2336832552 +.long 1100287487,1100287487 +.long 1895934009,1895934009 +.long 3736275976,3736275976 +.long 2632234200,2632234200 +.long 2428589668,2428589668 +.long 1636092795,1636092795 +.long 1890988757,1890988757 +.long 1952214088,1952214088 +.long 1113045200,1113045200 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 
247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.byte 82,9,106,213,48,54,165,56 +.byte 191,64,163,158,129,243,215,251 +.byte 124,227,57,130,155,47,255,135 +.byte 52,142,67,68,196,222,233,203 +.byte 84,123,148,50,166,194,35,61 +.byte 238,76,149,11,66,250,195,78 +.byte 8,46,161,102,40,217,36,178 +.byte 118,91,162,73,109,139,209,37 +.byte 114,248,246,100,134,104,152,22 +.byte 212,164,92,204,93,101,182,146 +.byte 108,112,72,80,253,237,185,218 +.byte 94,21,70,87,167,141,157,132 +.byte 144,216,171,0,140,188,211,10 +.byte 247,228,88,5,184,179,69,6 +.byte 208,44,30,143,202,63,15,2 +.byte 193,175,189,3,1,19,138,107 +.byte 58,145,17,65,79,103,220,234 +.byte 151,242,207,206,240,180,230,115 +.byte 150,172,116,34,231,173,53,133 +.byte 226,249,55,232,28,117,223,110 +.byte 71,241,26,113,29,41,197,137 +.byte 111,183,98,14,170,24,190,27 +.byte 252,86,62,75,198,210,121,32 +.byte 154,219,192,254,120,205,90,244 +.byte 31,221,168,51,136,7,199,49 +.byte 177,18,16,89,39,128,236,95 +.byte 96,81,127,169,25,181,74,13 +.byte 45,229,122,159,147,201,156,239 +.byte 160,224,59,77,174,42,245,176 +.byte 200,235,187,60,131,83,153,97 +.byte 23,43,4,126,186,119,214,38 +.byte 225,105,20,99,85,33,12,125 +.size _x86_AES_decrypt,.-_x86_AES_decrypt +.globl AES_decrypt +.type AES_decrypt,@function +.align 16 +AES_decrypt: +.L_AES_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 28(%esp),%edi + movl %esp,%eax + subl $36,%esp + andl $-64,%esp + leal -127(%edi),%ebx + subl %esp,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esp + addl $4,%esp + movl %eax,28(%esp) + call .L010pic_point +.L010pic_point: + popl %ebp + leal OPENSSL_ia32cap_P,%eax + leal .LAES_Td-.L010pic_point(%ebp),%ebp + leal 764(%esp),%ebx + subl 
%ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + btl $25,(%eax) + jnc .L011x86 + movq (%esi),%mm0 + movq 8(%esi),%mm4 + call _sse_AES_decrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 16 +.L011x86: + movl %ebp,24(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + call _x86_AES_decrypt_compact + movl 28(%esp),%esp + movl 24(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_decrypt,.-.L_AES_decrypt_begin +.globl AES_cbc_encrypt +.type AES_cbc_encrypt,@function +.align 16 +AES_cbc_encrypt: +.L_AES_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 28(%esp),%ecx + cmpl $0,%ecx + je .L012drop_out + call .L013pic_point +.L013pic_point: + popl %ebp + leal OPENSSL_ia32cap_P,%eax + cmpl $0,40(%esp) + leal .LAES_Te-.L013pic_point(%ebp),%ebp + jne .L014picked_te + leal .LAES_Td-.LAES_Te(%ebp),%ebp +.L014picked_te: + pushfl + cld + cmpl $512,%ecx + jb .L015slow_way + testl $15,%ecx + jnz .L015slow_way + btl $28,(%eax) + jc .L015slow_way + leal -324(%esp),%esi + andl $-64,%esi + movl %ebp,%eax + leal 2304(%ebp),%ebx + movl %esi,%edx + andl $4095,%eax + andl $4095,%ebx + andl $4095,%edx + cmpl %ebx,%edx + jb .L016tbl_break_out + subl %ebx,%edx + subl %edx,%esi + jmp .L017tbl_ok +.align 4 +.L016tbl_break_out: + subl %eax,%edx + andl $4095,%edx + addl $384,%edx + subl %edx,%esi +.align 4 +.L017tbl_ok: + leal 24(%esp),%edx + xchgl %esi,%esp + addl $4,%esp + movl %ebp,24(%esp) + movl %esi,28(%esp) + movl (%edx),%eax + movl 4(%edx),%ebx + movl 12(%edx),%edi + movl 16(%edx),%esi + movl 20(%edx),%edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edi,44(%esp) + movl %esi,48(%esp) + movl $0,316(%esp) + movl %edi,%ebx + movl $61,%ecx + subl %ebp,%ebx + movl %edi,%esi + andl $4095,%ebx + leal 76(%esp),%edi + cmpl $2304,%ebx + jb .L018do_copy + cmpl $3852,%ebx + jb .L019skip_copy +.align 4 +.L018do_copy: + movl %edi,44(%esp) +.long 2784229001 +.L019skip_copy: + movl $16,%edi +.align 4 +.L020prefetch_tbl: + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%esi + leal 128(%ebp),%ebp + subl $1,%edi + jnz .L020prefetch_tbl + subl $2048,%ebp + movl 32(%esp),%esi + movl 48(%esp),%edi + cmpl $0,%edx + je .L021fast_decrypt + movl (%edi),%eax + movl 4(%edi),%ebx +.align 16 +.L022fast_enc_loop: + movl 8(%edi),%ecx + movl 12(%edi),%edx + xorl (%esi),%eax + xorl 4(%esi),%ebx + xorl 8(%esi),%ecx + xorl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_encrypt + movl 32(%esp),%esi + movl 36(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + leal 16(%esi),%esi + movl 40(%esp),%ecx + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L022fast_enc_loop + movl 48(%esp),%esi + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + cmpl $0,316(%esp) + movl 44(%esp),%edi + je .L023skip_ezero + movl $60,%ecx + xorl %eax,%eax +.align 4 +.long 2884892297 +.L023skip_ezero: + movl 28(%esp),%esp + popfl +.L012drop_out: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L021fast_decrypt: + cmpl 36(%esp),%esi + je .L024fast_dec_in_place + movl %edi,52(%esp) +.align 4 
+.align 16 +.L025fast_dec_loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_decrypt + movl 52(%esp),%edi + movl 40(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl 36(%esp),%edi + movl 32(%esp),%esi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + movl %esi,52(%esp) + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edi + movl %edi,36(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L025fast_dec_loop + movl 52(%esp),%edi + movl 48(%esp),%esi + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + jmp .L026fast_dec_out +.align 16 +.L024fast_dec_in_place: +.L027fast_dec_in_place_loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + leal 60(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 44(%esp),%edi + call _x86_AES_decrypt + movl 48(%esp),%edi + movl 36(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + leal 16(%esi),%esi + movl %esi,36(%esp) + leal 60(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 32(%esp),%esi + movl 40(%esp),%ecx + leal 16(%esi),%esi + movl %esi,32(%esp) + subl $16,%ecx + movl %ecx,40(%esp) + jnz .L027fast_dec_in_place_loop +.align 4 +.L026fast_dec_out: + cmpl $0,316(%esp) + movl 44(%esp),%edi + je .L028skip_dzero + movl $60,%ecx + xorl %eax,%eax +.align 4 +.long 2884892297 +.L028skip_dzero: + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L015slow_way: + movl (%eax),%eax + movl 36(%esp),%edi + leal -80(%esp),%esi + andl $-64,%esi + leal -143(%edi),%ebx + subl %esi,%ebx + negl %ebx + andl $960,%ebx + subl %ebx,%esi + leal 768(%esi),%ebx + subl %ebp,%ebx + andl $768,%ebx + leal 2176(%ebp,%ebx,1),%ebp + leal 24(%esp),%edx + xchgl %esi,%esp + addl $4,%esp + movl %ebp,24(%esp) + movl %esi,28(%esp) + movl %eax,52(%esp) + movl (%edx),%eax + movl 4(%edx),%ebx + movl 16(%edx),%esi + movl 20(%edx),%edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edi,44(%esp) + movl %esi,48(%esp) + movl %esi,%edi + movl %eax,%esi + cmpl $0,%edx + je .L029slow_decrypt + cmpl $16,%ecx + movl %ebx,%edx + jb .L030slow_enc_tail + btl $25,52(%esp) + jnc .L031slow_enc_x86 + movq (%edi),%mm0 + movq 8(%edi),%mm4 +.align 16 +.L032slow_enc_loop_sse: + pxor (%esi),%mm0 + pxor 8(%esi),%mm4 + movl 44(%esp),%edi + call _sse_AES_encrypt_compact + movl 32(%esp),%esi + movl 36(%esp),%edi + movl 40(%esp),%ecx + movq %mm0,(%edi) + movq %mm4,8(%edi) + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + cmpl $16,%ecx + movl %ecx,40(%esp) + jae .L032slow_enc_loop_sse + testl $15,%ecx + jnz .L030slow_enc_tail + movl 48(%esp),%esi + movq %mm0,(%esi) + movq %mm4,8(%esi) + emms + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L031slow_enc_x86: + movl (%edi),%eax + movl 4(%edi),%ebx +.align 4 +.L033slow_enc_loop_x86: + movl 8(%edi),%ecx + movl 12(%edi),%edx + xorl (%esi),%eax + xorl 4(%esi),%ebx + xorl 8(%esi),%ecx + xorl 
12(%esi),%edx + movl 44(%esp),%edi + call _x86_AES_encrypt_compact + movl 32(%esp),%esi + movl 36(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + leal 16(%esi),%esi + movl %esi,32(%esp) + leal 16(%edi),%edx + movl %edx,36(%esp) + subl $16,%ecx + cmpl $16,%ecx + movl %ecx,40(%esp) + jae .L033slow_enc_loop_x86 + testl $15,%ecx + jnz .L030slow_enc_tail + movl 48(%esp),%esi + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L030slow_enc_tail: + emms + movl %edx,%edi + movl $16,%ebx + subl %ecx,%ebx + cmpl %esi,%edi + je .L034enc_in_place +.align 4 +.long 2767451785 + jmp .L035enc_skip_in_place +.L034enc_in_place: + leal (%edi,%ecx,1),%edi +.L035enc_skip_in_place: + movl %ebx,%ecx + xorl %eax,%eax +.align 4 +.long 2868115081 + movl 48(%esp),%edi + movl %edx,%esi + movl (%edi),%eax + movl 4(%edi),%ebx + movl $16,40(%esp) + jmp .L033slow_enc_loop_x86 +.align 16 +.L029slow_decrypt: + btl $25,52(%esp) + jnc .L036slow_dec_loop_x86 +.align 4 +.L037slow_dec_loop_sse: + movq (%esi),%mm0 + movq 8(%esi),%mm4 + movl 44(%esp),%edi + call _sse_AES_decrypt_compact + movl 32(%esp),%esi + leal 60(%esp),%eax + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl 48(%esp),%edi + movq (%esi),%mm1 + movq 8(%esi),%mm5 + pxor (%edi),%mm0 + pxor 8(%edi),%mm4 + movq %mm1,(%edi) + movq %mm5,8(%edi) + subl $16,%ecx + jc .L038slow_dec_partial_sse + movq %mm0,(%ebx) + movq %mm4,8(%ebx) + leal 16(%ebx),%ebx + movl %ebx,36(%esp) + leal 16(%esi),%esi + movl %esi,32(%esp) + movl %ecx,40(%esp) + jnz .L037slow_dec_loop_sse + emms + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L038slow_dec_partial_sse: + movq %mm0,(%eax) + movq %mm4,8(%eax) + emms + addl $16,%ecx + movl %ebx,%edi + movl %eax,%esi +.align 4 +.long 2767451785 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L036slow_dec_loop_x86: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + leal 60(%esp),%edi + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 44(%esp),%edi + call _x86_AES_decrypt_compact + movl 48(%esp),%edi + movl 40(%esp),%esi + xorl (%edi),%eax + xorl 4(%edi),%ebx + xorl 8(%edi),%ecx + xorl 12(%edi),%edx + subl $16,%esi + jc .L039slow_dec_partial_x86 + movl %esi,40(%esp) + movl 36(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + leal 16(%esi),%esi + movl %esi,36(%esp) + leal 60(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 32(%esp),%esi + leal 16(%esi),%esi + movl %esi,32(%esp) + jnz .L036slow_dec_loop_x86 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + pushfl +.align 16 +.L039slow_dec_partial_x86: + leal 60(%esp),%esi + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + movl 32(%esp),%esi + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 40(%esp),%ecx + movl 36(%esp),%edi + leal 60(%esp),%esi +.align 4 +.long 2767451785 + movl 28(%esp),%esp + popfl + popl %edi + popl %esi + popl %ebx + popl 
%ebp + ret +.size AES_cbc_encrypt,.-.L_AES_cbc_encrypt_begin +.type _x86_AES_set_encrypt_key,@function +.align 16 +_x86_AES_set_encrypt_key: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%esi + movl 32(%esp),%edi + testl $-1,%esi + jz .L040badpointer + testl $-1,%edi + jz .L040badpointer + call .L041pic_point +.L041pic_point: + popl %ebp + leal .LAES_Te-.L041pic_point(%ebp),%ebp + leal 2176(%ebp),%ebp + movl -128(%ebp),%eax + movl -96(%ebp),%ebx + movl -64(%ebp),%ecx + movl -32(%ebp),%edx + movl (%ebp),%eax + movl 32(%ebp),%ebx + movl 64(%ebp),%ecx + movl 96(%ebp),%edx + movl 28(%esp),%ecx + cmpl $128,%ecx + je .L04210rounds + cmpl $192,%ecx + je .L04312rounds + cmpl $256,%ecx + je .L04414rounds + movl $-2,%eax + jmp .L045exit +.L04210rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + xorl %ecx,%ecx + jmp .L04610shortcut +.align 4 +.L04710loop: + movl (%edi),%eax + movl 12(%edi),%edx +.L04610shortcut: + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,16(%edi) + xorl 4(%edi),%eax + movl %eax,20(%edi) + xorl 8(%edi),%eax + movl %eax,24(%edi) + xorl 12(%edi),%eax + movl %eax,28(%edi) + incl %ecx + addl $16,%edi + cmpl $10,%ecx + jl .L04710loop + movl $10,80(%edi) + xorl %eax,%eax + jmp .L045exit +.L04312rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 16(%esi),%ecx + movl 20(%esi),%edx + movl %ecx,16(%edi) + movl %edx,20(%edi) + xorl %ecx,%ecx + jmp .L04812shortcut +.align 4 +.L04912loop: + movl (%edi),%eax + movl 20(%edi),%edx +.L04812shortcut: + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,24(%edi) + xorl 4(%edi),%eax + movl %eax,28(%edi) + xorl 8(%edi),%eax + movl %eax,32(%edi) + xorl 12(%edi),%eax + movl %eax,36(%edi) + cmpl $7,%ecx + je .L05012break + incl %ecx + xorl 16(%edi),%eax + movl %eax,40(%edi) + xorl 20(%edi),%eax + movl %eax,44(%edi) + addl $24,%edi + jmp .L04912loop +.L05012break: + movl $12,72(%edi) + xorl %eax,%eax + jmp .L045exit +.L04414rounds: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,8(%edi) + movl %edx,12(%edi) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,16(%edi) + movl %ebx,20(%edi) + movl %ecx,24(%edi) + movl %edx,28(%edi) + xorl %ecx,%ecx + jmp .L05114shortcut +.align 4 +.L05214loop: + movl 28(%edi),%edx +.L05114shortcut: + movl (%edi),%eax + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $24,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll 
$8,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $16,%ebx + xorl %ebx,%eax + xorl 896(%ebp,%ecx,4),%eax + movl %eax,32(%edi) + xorl 4(%edi),%eax + movl %eax,36(%edi) + xorl 8(%edi),%eax + movl %eax,40(%edi) + xorl 12(%edi),%eax + movl %eax,44(%edi) + cmpl $6,%ecx + je .L05314break + incl %ecx + movl %eax,%edx + movl 16(%edi),%eax + movzbl %dl,%esi + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shrl $16,%edx + shll $8,%ebx + movzbl %dl,%esi + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + movzbl %dh,%esi + shll $16,%ebx + xorl %ebx,%eax + movzbl -128(%ebp,%esi,1),%ebx + shll $24,%ebx + xorl %ebx,%eax + movl %eax,48(%edi) + xorl 20(%edi),%eax + movl %eax,52(%edi) + xorl 24(%edi),%eax + movl %eax,56(%edi) + xorl 28(%edi),%eax + movl %eax,60(%edi) + addl $32,%edi + jmp .L05214loop +.L05314break: + movl $14,48(%edi) + xorl %eax,%eax + jmp .L045exit +.L040badpointer: + movl $-1,%eax +.L045exit: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _x86_AES_set_encrypt_key,.-_x86_AES_set_encrypt_key +.globl AES_set_encrypt_key +.type AES_set_encrypt_key,@function +.align 16 +AES_set_encrypt_key: +.L_AES_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + call _x86_AES_set_encrypt_key + ret +.size AES_set_encrypt_key,.-.L_AES_set_encrypt_key_begin +.globl AES_set_decrypt_key +.type AES_set_decrypt_key,@function +.align 16 +AES_set_decrypt_key: +.L_AES_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + + call _x86_AES_set_encrypt_key + cmpl $0,%eax + je .L054proceed + ret +.L054proceed: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 28(%esp),%esi + movl 240(%esi),%ecx + leal (,%ecx,4),%ecx + leal (%esi,%ecx,4),%edi +.align 4 +.L055invert: + movl (%esi),%eax + movl 4(%esi),%ebx + movl (%edi),%ecx + movl 4(%edi),%edx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl %ecx,(%esi) + movl %edx,4(%esi) + movl 8(%esi),%eax + movl 12(%esi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl %eax,8(%edi) + movl %ebx,12(%edi) + movl %ecx,8(%esi) + movl %edx,12(%esi) + addl $16,%esi + subl $16,%edi + cmpl %edi,%esi + jne .L055invert + movl 28(%esp),%edi + movl 240(%edi),%esi + leal -2(%esi,%esi,1),%esi + leal (%edi,%esi,8),%esi + movl %esi,28(%esp) + movl 16(%edi),%eax +.align 4 +.L056permute: + addl $16,%edi + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %esi,%ebx + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %eax,%ebx + xorl %esi,%ecx + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + xorl %eax,%ecx + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + roll $8,%eax + xorl %esi,%edx + movl 4(%edi),%ebp + xorl %ebx,%eax + xorl %edx,%ebx + xorl %ecx,%eax + roll $24,%ebx + xorl %edx,%ecx + xorl %edx,%eax + roll $16,%ecx + xorl %ebx,%eax + roll $8,%edx + xorl %ecx,%eax + movl %ebp,%ebx + xorl %edx,%eax + movl %eax,(%edi) + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + xorl %esi,%ecx + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %ebx,%ecx + 
xorl %esi,%edx + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + xorl %ebx,%edx + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + roll $8,%ebx + xorl %esi,%eax + movl 8(%edi),%ebp + xorl %ecx,%ebx + xorl %eax,%ecx + xorl %edx,%ebx + roll $24,%ecx + xorl %eax,%edx + xorl %eax,%ebx + roll $16,%edx + xorl %ecx,%ebx + roll $8,%eax + xorl %edx,%ebx + movl %ebp,%ecx + xorl %eax,%ebx + movl %ebx,4(%edi) + movl $2155905152,%ebp + andl %ecx,%ebp + leal (%ecx,%ecx,1),%edx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%edx + andl $454761243,%esi + xorl %esi,%edx + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %ecx,%edx + xorl %esi,%eax + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + xorl %ecx,%eax + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + roll $8,%ecx + xorl %esi,%ebx + movl 12(%edi),%ebp + xorl %edx,%ecx + xorl %ebx,%edx + xorl %eax,%ecx + roll $24,%edx + xorl %ebx,%eax + xorl %ebx,%ecx + roll $16,%eax + xorl %edx,%ecx + roll $8,%ebx + xorl %eax,%ecx + movl %ebp,%edx + xorl %ebx,%ecx + movl %ecx,8(%edi) + movl $2155905152,%ebp + andl %edx,%ebp + leal (%edx,%edx,1),%eax + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%eax + andl $454761243,%esi + xorl %esi,%eax + movl $2155905152,%ebp + andl %eax,%ebp + leal (%eax,%eax,1),%ebx + movl %ebp,%esi + shrl $7,%ebp + subl %ebp,%esi + andl $4278124286,%ebx + andl $454761243,%esi + xorl %edx,%eax + xorl %esi,%ebx + movl $2155905152,%ebp + andl %ebx,%ebp + leal (%ebx,%ebx,1),%ecx + movl %ebp,%esi + shrl $7,%ebp + xorl %edx,%ebx + subl %ebp,%esi + andl $4278124286,%ecx + andl $454761243,%esi + roll $8,%edx + xorl %esi,%ecx + movl 16(%edi),%ebp + xorl %eax,%edx + xorl %ecx,%eax + xorl %ebx,%edx + roll $24,%eax + xorl %ecx,%ebx + xorl %ecx,%edx + roll $16,%ebx + xorl %eax,%edx + roll $8,%ecx + xorl %ebx,%edx + movl %ebp,%eax + xorl %ecx,%edx + movl %edx,12(%edi) + cmpl 28(%esp),%edi + jb .L056permute + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size AES_set_decrypt_key,.-.L_AES_set_decrypt_key_begin +.byte 65,69,83,32,102,111,114,32,120,56,54,44,32,67,82,89 +.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 +.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: +#endif diff --git a/sys/crypto/openssl/i386/aesni-x86.S b/sys/crypto/openssl/i386/aesni-x86.S index ea2058f2bad8a..9dbcba730f33f 100644 --- a/sys/crypto/openssl/i386/aesni-x86.S +++ b/sys/crypto/openssl/i386/aesni-x86.S @@ -6,6 +6,11 @@ .align 16 aesni_encrypt: .L_aesni_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -33,6 +38,11 @@ aesni_encrypt: .align 16 aesni_decrypt: .L_aesni_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -58,6 +68,11 @@ aesni_decrypt: .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -85,6 +100,11 @@ _aesni_encrypt2: .type 
_aesni_decrypt2,@function .align 16 _aesni_decrypt2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -112,6 +132,11 @@ _aesni_decrypt2: .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -144,6 +169,11 @@ _aesni_encrypt3: .type _aesni_decrypt3,@function .align 16 _aesni_decrypt3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -176,6 +206,11 @@ _aesni_decrypt3: .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx @@ -214,6 +249,11 @@ _aesni_encrypt4: .type _aesni_decrypt4,@function .align 16 _aesni_decrypt4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx @@ -252,6 +292,11 @@ _aesni_decrypt4: .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -306,6 +351,11 @@ _aesni_encrypt6: .type _aesni_decrypt6,@function .align 16 _aesni_decrypt6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -362,6 +412,11 @@ _aesni_decrypt6: .align 16 aesni_ecb_encrypt: .L_aesni_ecb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -597,6 +652,11 @@ aesni_ecb_encrypt: .align 16 aesni_ccm64_encrypt_blocks: .L_aesni_ccm64_encrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -685,6 +745,11 @@ aesni_ccm64_encrypt_blocks: .align 16 aesni_ccm64_decrypt_blocks: .L_aesni_ccm64_decrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -808,6 +873,11 @@ aesni_ccm64_decrypt_blocks: .align 16 aesni_ctr32_encrypt_blocks: .L_aesni_ctr32_encrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1046,6 +1116,11 @@ aesni_ctr32_encrypt_blocks: .align 16 aesni_xts_encrypt: .L_aesni_xts_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1406,6 +1481,11 @@ aesni_xts_encrypt: .align 16 aesni_xts_decrypt: .L_aesni_xts_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1796,6 +1876,11 @@ aesni_xts_decrypt: .align 16 aesni_ocb_encrypt: .L_aesni_ocb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2191,6 +2276,11 @@ aesni_ocb_encrypt: .align 16 aesni_ocb_decrypt: .L_aesni_ocb_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2586,6 +2676,11 @@ aesni_ocb_decrypt: .align 16 aesni_cbc_encrypt: .L_aesni_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2845,6 +2940,11 @@ aesni_cbc_encrypt: .type _aesni_set_encrypt_key,@function .align 16 _aesni_set_encrypt_key: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx testl %eax,%eax @@ -3180,6 +3280,11 @@ _aesni_set_encrypt_key: .align 16 aesni_set_encrypt_key: .L_aesni_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -3191,6 +3296,11 @@ 
aesni_set_encrypt_key: .align 16 aesni_set_decrypt_key: .L_aesni_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -3237,6 +3347,23 @@ aesni_set_decrypt_key: .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl aesni_encrypt @@ -3244,6 +3371,11 @@ aesni_set_decrypt_key: .align 16 aesni_encrypt: .L_aesni_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -3271,6 +3403,11 @@ aesni_encrypt: .align 16 aesni_decrypt: .L_aesni_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 12(%esp),%edx movups (%eax),%xmm2 @@ -3296,6 +3433,11 @@ aesni_decrypt: .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3323,6 +3465,11 @@ _aesni_encrypt2: .type _aesni_decrypt2,@function .align 16 _aesni_decrypt2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3350,6 +3497,11 @@ _aesni_decrypt2: .type _aesni_encrypt3,@function .align 16 _aesni_encrypt3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3382,6 +3534,11 @@ _aesni_encrypt3: .type _aesni_decrypt3,@function .align 16 _aesni_decrypt3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3414,6 +3571,11 @@ _aesni_decrypt3: .type _aesni_encrypt4,@function .align 16 _aesni_encrypt4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx @@ -3452,6 +3614,11 @@ _aesni_encrypt4: .type _aesni_decrypt4,@function .align 16 _aesni_decrypt4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 movups 16(%edx),%xmm1 shll $4,%ecx @@ -3490,6 +3657,11 @@ _aesni_decrypt4: .type _aesni_encrypt6,@function .align 16 _aesni_encrypt6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3544,6 +3716,11 @@ _aesni_encrypt6: .type _aesni_decrypt6,@function .align 16 _aesni_decrypt6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movups (%edx),%xmm0 shll $4,%ecx movups 16(%edx),%xmm1 @@ -3600,6 +3777,11 @@ _aesni_decrypt6: .align 16 aesni_ecb_encrypt: .L_aesni_ecb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3835,6 +4017,11 @@ aesni_ecb_encrypt: .align 16 aesni_ccm64_encrypt_blocks: .L_aesni_ccm64_encrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3923,6 +4110,11 @@ aesni_ccm64_encrypt_blocks: .align 16 aesni_ccm64_decrypt_blocks: .L_aesni_ccm64_decrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4046,6 +4238,11 @@ aesni_ccm64_decrypt_blocks: .align 16 aesni_ctr32_encrypt_blocks: .L_aesni_ctr32_encrypt_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4284,6 +4481,11 @@ aesni_ctr32_encrypt_blocks: .align 16 aesni_xts_encrypt: .L_aesni_xts_encrypt_begin: + #ifdef 
__CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4644,6 +4846,11 @@ aesni_xts_encrypt: .align 16 aesni_xts_decrypt: .L_aesni_xts_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5034,6 +5241,11 @@ aesni_xts_decrypt: .align 16 aesni_ocb_encrypt: .L_aesni_ocb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5429,6 +5641,11 @@ aesni_ocb_encrypt: .align 16 aesni_ocb_decrypt: .L_aesni_ocb_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5824,6 +6041,11 @@ aesni_ocb_decrypt: .align 16 aesni_cbc_encrypt: .L_aesni_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -6083,6 +6305,11 @@ aesni_cbc_encrypt: .type _aesni_set_encrypt_key,@function .align 16 _aesni_set_encrypt_key: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx testl %eax,%eax @@ -6418,6 +6645,11 @@ _aesni_set_encrypt_key: .align 16 aesni_set_encrypt_key: .L_aesni_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -6429,6 +6661,11 @@ aesni_set_encrypt_key: .align 16 aesni_set_decrypt_key: .L_aesni_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%ecx movl 12(%esp),%edx @@ -6475,4 +6712,21 @@ aesni_set_decrypt_key: .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/bf-586.S b/sys/crypto/openssl/i386/bf-586.S index fd533bb993152..5f68ffad0f577 100644 --- a/sys/crypto/openssl/i386/bf-586.S +++ b/sys/crypto/openssl/i386/bf-586.S @@ -6,6 +6,11 @@ .align 16 BF_encrypt: .L_BF_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -356,6 +361,11 @@ BF_encrypt: .align 16 BF_decrypt: .L_BF_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -706,6 +716,11 @@ BF_decrypt: .align 16 BF_cbc_encrypt: .L_BF_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -769,21 +784,56 @@ BF_cbc_encrypt: xorl %edx,%edx jmp *%ebp .L006ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L007ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L008ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L009ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L010ejend .L011ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L012ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L013ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L010ejend: xorl %ecx,%eax @@ -895,6 +945,23 @@ BF_cbc_encrypt: .long .L006ej7-.L004PIC_point .align 64 .size BF_cbc_encrypt,.-.L_BF_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl BF_encrypt @@ -902,6 +969,11 @@ BF_cbc_encrypt: .align 16 BF_encrypt: 
.L_BF_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1252,6 +1324,11 @@ BF_encrypt: .align 16 BF_decrypt: .L_BF_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1602,6 +1679,11 @@ BF_decrypt: .align 16 BF_cbc_encrypt: .L_BF_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1665,21 +1747,56 @@ BF_cbc_encrypt: xorl %edx,%edx jmp *%ebp .L006ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L007ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L008ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L009ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L010ejend .L011ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L012ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L013ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L010ejend: xorl %ecx,%eax @@ -1791,4 +1908,21 @@ BF_cbc_encrypt: .long .L006ej7-.L004PIC_point .align 64 .size BF_cbc_encrypt,.-.L_BF_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/bn-586.S b/sys/crypto/openssl/i386/bn-586.S index 512bcf3a82dbd..ebbea406881fb 100644 --- a/sys/crypto/openssl/i386/bn-586.S +++ b/sys/crypto/openssl/i386/bn-586.S @@ -6,6 +6,11 @@ .align 16 bn_mul_add_words: .L_bn_mul_add_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L000PIC_me_up .L000PIC_me_up: popl %eax @@ -289,6 +294,11 @@ bn_mul_add_words: .align 16 bn_mul_words: .L_bn_mul_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L010PIC_me_up .L010PIC_me_up: popl %eax @@ -471,6 +481,11 @@ bn_mul_words: .align 16 bn_sqr_words: .L_bn_sqr_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L017PIC_me_up .L017PIC_me_up: popl %eax @@ -612,6 +627,11 @@ bn_sqr_words: .align 16 bn_div_words: .L_bn_div_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax movl 12(%esp),%ecx @@ -623,6 +643,11 @@ bn_div_words: .align 16 bn_add_words: .L_bn_add_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -805,6 +830,11 @@ bn_add_words: .align 16 bn_sub_words: .L_bn_sub_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -987,6 +1017,11 @@ bn_sub_words: .align 16 bn_sub_part_words: .L_bn_sub_part_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1529,6 +1564,23 @@ bn_sub_part_words: ret .size bn_sub_part_words,.-.L_bn_sub_part_words_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl bn_mul_add_words @@ -1536,6 +1588,11 @@ bn_sub_part_words: .align 16 bn_mul_add_words: .L_bn_mul_add_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%eax btl $26,(%eax) jnc .L000maw_non_sse2 @@ -1816,6 +1873,11 @@ bn_mul_add_words: .align 16 bn_mul_words: .L_bn_mul_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + 
#endif + leal OPENSSL_ia32cap_P,%eax btl $26,(%eax) jnc .L009mw_non_sse2 @@ -1995,6 +2057,11 @@ bn_mul_words: .align 16 bn_sqr_words: .L_bn_sqr_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%eax btl $26,(%eax) jnc .L015sqr_non_sse2 @@ -2133,6 +2200,11 @@ bn_sqr_words: .align 16 bn_div_words: .L_bn_div_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax movl 12(%esp),%ecx @@ -2144,6 +2216,11 @@ bn_div_words: .align 16 bn_add_words: .L_bn_add_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2326,6 +2403,11 @@ bn_add_words: .align 16 bn_sub_words: .L_bn_sub_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2508,6 +2590,11 @@ bn_sub_words: .align 16 bn_sub_part_words: .L_bn_sub_part_words_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3050,4 +3137,21 @@ bn_sub_part_words: ret .size bn_sub_part_words,.-.L_bn_sub_part_words_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/cast-586.S b/sys/crypto/openssl/i386/cast-586.S index fea8b021a95f0..f016559e7fb42 100644 --- a/sys/crypto/openssl/i386/cast-586.S +++ b/sys/crypto/openssl/i386/cast-586.S @@ -6,6 +6,11 @@ .align 16 CAST_encrypt: .L_CAST_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -376,6 +381,11 @@ CAST_encrypt: .align 16 CAST_decrypt: .L_CAST_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -743,6 +753,11 @@ CAST_decrypt: .align 16 CAST_cbc_encrypt: .L_CAST_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -806,21 +821,56 @@ CAST_cbc_encrypt: xorl %edx,%edx jmp *%ebp .L008ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L009ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L010ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L011ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L012ejend .L013ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L014ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L015ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L012ejend: xorl %ecx,%eax @@ -932,6 +982,23 @@ CAST_cbc_encrypt: .long .L008ej7-.L006PIC_point .align 64 .size CAST_cbc_encrypt,.-.L_CAST_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl CAST_encrypt @@ -939,6 +1006,11 @@ CAST_cbc_encrypt: .align 16 CAST_encrypt: .L_CAST_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1309,6 +1381,11 @@ CAST_encrypt: .align 16 CAST_decrypt: .L_CAST_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1676,6 +1753,11 @@ CAST_decrypt: .align 16 CAST_cbc_encrypt: .L_CAST_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1739,21 +1821,56 @@ CAST_cbc_encrypt: xorl %edx,%edx 
jmp *%ebp .L008ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L009ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L010ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L011ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L012ejend .L013ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L014ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L015ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L012ejend: xorl %ecx,%eax @@ -1865,4 +1982,21 @@ CAST_cbc_encrypt: .long .L008ej7-.L006PIC_point .align 64 .size CAST_cbc_encrypt,.-.L_CAST_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/chacha-x86.S b/sys/crypto/openssl/i386/chacha-x86.S index a45c3e3a37e1f..df2b4c3f4f95c 100644 --- a/sys/crypto/openssl/i386/chacha-x86.S +++ b/sys/crypto/openssl/i386/chacha-x86.S @@ -6,6 +6,11 @@ .align 16 ChaCha20_ctr32: .L_ChaCha20_ctr32_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -379,6 +384,11 @@ ChaCha20_ctr32: .align 16 ChaCha20_ssse3: .L_ChaCha20_ssse3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -534,6 +544,11 @@ ChaCha20_ssse3: .align 16 ChaCha20_xop: .L_ChaCha20_xop_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1008,6 +1023,23 @@ ChaCha20_xop: ret .size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl ChaCha20_ctr32 @@ -1015,6 +1047,11 @@ ChaCha20_xop: .align 16 ChaCha20_ctr32: .L_ChaCha20_ctr32_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1388,6 +1425,11 @@ ChaCha20_ctr32: .align 16 ChaCha20_ssse3: .L_ChaCha20_ssse3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1543,6 +1585,11 @@ ChaCha20_ssse3: .align 16 ChaCha20_xop: .L_ChaCha20_xop_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2017,4 +2064,21 @@ ChaCha20_xop: ret .size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/cmll-x86.S b/sys/crypto/openssl/i386/cmll-x86.S index 3a49ea43c2308..f47261016e40e 100644 --- a/sys/crypto/openssl/i386/cmll-x86.S +++ b/sys/crypto/openssl/i386/cmll-x86.S @@ -6,6 +6,11 @@ .align 16 Camellia_EncryptBlock_Rounds: .L_Camellia_EncryptBlock_Rounds_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -60,6 +65,11 @@ Camellia_EncryptBlock_Rounds: .align 16 Camellia_EncryptBlock: .L_Camellia_EncryptBlock_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $128,%eax subl 4(%esp),%eax movl $3,%eax @@ -72,6 +82,11 @@ Camellia_EncryptBlock: .align 16 Camellia_encrypt: .L_Camellia_encrypt_begin: + 
#ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -124,6 +139,11 @@ Camellia_encrypt: .type _x86_Camellia_encrypt,@function .align 16 _x86_Camellia_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl (%edi),%eax xorl 4(%edi),%ebx xorl 8(%edi),%ecx @@ -354,6 +374,11 @@ _x86_Camellia_encrypt: .align 16 Camellia_DecryptBlock_Rounds: .L_Camellia_DecryptBlock_Rounds_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -408,6 +433,11 @@ Camellia_DecryptBlock_Rounds: .align 16 Camellia_DecryptBlock: .L_Camellia_DecryptBlock_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $128,%eax subl 4(%esp),%eax movl $3,%eax @@ -420,6 +450,11 @@ Camellia_DecryptBlock: .align 16 Camellia_decrypt: .L_Camellia_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -472,6 +507,11 @@ Camellia_decrypt: .type _x86_Camellia_decrypt,@function .align 16 _x86_Camellia_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl (%edi),%eax xorl 4(%edi),%ebx xorl 8(%edi),%ecx @@ -702,6 +742,11 @@ _x86_Camellia_decrypt: .align 16 Camellia_Ekeygen: .L_Camellia_Ekeygen_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1543,6 +1588,11 @@ Camellia_Ekeygen: .align 16 Camellia_set_key: .L_Camellia_set_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ecx movl 12(%esp),%ebx @@ -2095,6 +2145,11 @@ Camellia_set_key: .align 16 Camellia_cbc_encrypt: .L_Camellia_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2374,6 +2429,23 @@ Camellia_cbc_encrypt: .byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl Camellia_EncryptBlock_Rounds @@ -2381,6 +2453,11 @@ Camellia_cbc_encrypt: .align 16 Camellia_EncryptBlock_Rounds: .L_Camellia_EncryptBlock_Rounds_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2435,6 +2512,11 @@ Camellia_EncryptBlock_Rounds: .align 16 Camellia_EncryptBlock: .L_Camellia_EncryptBlock_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $128,%eax subl 4(%esp),%eax movl $3,%eax @@ -2447,6 +2529,11 @@ Camellia_EncryptBlock: .align 16 Camellia_encrypt: .L_Camellia_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2499,6 +2586,11 @@ Camellia_encrypt: .type _x86_Camellia_encrypt,@function .align 16 _x86_Camellia_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl (%edi),%eax xorl 4(%edi),%ebx xorl 8(%edi),%ecx @@ -2729,6 +2821,11 @@ _x86_Camellia_encrypt: .align 16 Camellia_DecryptBlock_Rounds: .L_Camellia_DecryptBlock_Rounds_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2783,6 +2880,11 @@ Camellia_DecryptBlock_Rounds: .align 16 Camellia_DecryptBlock: .L_Camellia_DecryptBlock_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $128,%eax subl 4(%esp),%eax movl $3,%eax @@ -2795,6 +2897,11 @@ Camellia_DecryptBlock: .align 16 Camellia_decrypt: .L_Camellia_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx 
pushl %esi @@ -2847,6 +2954,11 @@ Camellia_decrypt: .type _x86_Camellia_decrypt,@function .align 16 _x86_Camellia_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl (%edi),%eax xorl 4(%edi),%ebx xorl 8(%edi),%ecx @@ -3077,6 +3189,11 @@ _x86_Camellia_decrypt: .align 16 Camellia_Ekeygen: .L_Camellia_Ekeygen_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3918,6 +4035,11 @@ Camellia_Ekeygen: .align 16 Camellia_set_key: .L_Camellia_set_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ecx movl 12(%esp),%ebx @@ -4470,6 +4592,11 @@ Camellia_set_key: .align 16 Camellia_cbc_encrypt: .L_Camellia_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4749,4 +4876,21 @@ Camellia_cbc_encrypt: .byte 67,97,109,101,108,108,105,97,32,102,111,114,32,120,56,54 .byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 .byte 115,108,46,111,114,103,62,0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/co-586.S b/sys/crypto/openssl/i386/co-586.S index e3f734245def6..ed8f2d14f2eea 100644 --- a/sys/crypto/openssl/i386/co-586.S +++ b/sys/crypto/openssl/i386/co-586.S @@ -6,6 +6,11 @@ .align 16 bn_mul_comba8: .L_bn_mul_comba8_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi movl 12(%esp),%esi pushl %edi @@ -550,6 +555,11 @@ bn_mul_comba8: .align 16 bn_mul_comba4: .L_bn_mul_comba4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi movl 12(%esp),%esi pushl %edi @@ -718,6 +728,11 @@ bn_mul_comba4: .align 16 bn_sqr_comba8: .L_bn_sqr_comba8_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi pushl %ebp @@ -1126,6 +1141,11 @@ bn_sqr_comba8: .align 16 bn_sqr_comba4: .L_bn_sqr_comba4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi pushl %ebp @@ -1253,6 +1273,23 @@ bn_sqr_comba4: popl %esi ret .size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl bn_mul_comba8 @@ -1260,6 +1297,11 @@ bn_sqr_comba4: .align 16 bn_mul_comba8: .L_bn_mul_comba8_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi movl 12(%esp),%esi pushl %edi @@ -1804,6 +1846,11 @@ bn_mul_comba8: .align 16 bn_mul_comba4: .L_bn_mul_comba4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi movl 12(%esp),%esi pushl %edi @@ -1972,6 +2019,11 @@ bn_mul_comba4: .align 16 bn_sqr_comba8: .L_bn_sqr_comba8_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi pushl %ebp @@ -2380,6 +2432,11 @@ bn_sqr_comba8: .align 16 bn_sqr_comba4: .L_bn_sqr_comba4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi pushl %ebp @@ -2507,4 +2564,21 @@ bn_sqr_comba4: popl %esi ret .size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/crypt586.S b/sys/crypto/openssl/i386/crypt586.S index fd5e72fd8abe7..b86cc00e680c9 100644 --- 
a/sys/crypto/openssl/i386/crypt586.S +++ b/sys/crypto/openssl/i386/crypt586.S @@ -6,6 +6,11 @@ .align 16 fcrypt_body: .L_fcrypt_body_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -878,6 +883,23 @@ fcrypt_body: popl %ebp ret .size fcrypt_body,.-.L_fcrypt_body_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl fcrypt_body @@ -885,6 +907,11 @@ fcrypt_body: .align 16 fcrypt_body: .L_fcrypt_body_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1753,4 +1780,21 @@ fcrypt_body: popl %ebp ret .size fcrypt_body,.-.L_fcrypt_body_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/des-586.S b/sys/crypto/openssl/i386/des-586.S index fd9506d92c4c2..41c22a851e88a 100644 --- a/sys/crypto/openssl/i386/des-586.S +++ b/sys/crypto/openssl/i386/des-586.S @@ -5,6 +5,11 @@ .type _x86_DES_encrypt,@function .align 16 _x86_DES_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ecx movl (%ecx),%eax @@ -476,6 +481,11 @@ _x86_DES_encrypt: .type _x86_DES_decrypt,@function .align 16 _x86_DES_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ecx movl 120(%ecx),%eax @@ -949,6 +959,11 @@ _x86_DES_decrypt: .align 16 DES_encrypt1: .L_DES_encrypt1_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi @@ -1062,6 +1077,11 @@ DES_encrypt1: .align 16 DES_encrypt2: .L_DES_encrypt2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi @@ -1105,6 +1125,11 @@ DES_encrypt2: .align 16 DES_encrypt3: .L_DES_encrypt3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ebx pushl %ebp @@ -1226,6 +1251,11 @@ DES_encrypt3: .align 16 DES_decrypt3: .L_DES_decrypt3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ebx pushl %ebp @@ -1347,6 +1377,11 @@ DES_decrypt3: .align 16 DES_ncbc_encrypt: .L_DES_ncbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1408,21 +1443,56 @@ DES_ncbc_encrypt: xorl %edx,%edx jmp *%ebp .L012ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L013ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L014ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L015ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L016ejend .L017ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L018ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L019ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L016ejend: xorl %ecx,%eax @@ -1527,6 +1597,11 @@ DES_ncbc_encrypt: .align 16 DES_ede3_cbc_encrypt: .L_DES_ede3_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1592,21 +1667,56 @@ DES_ede3_cbc_encrypt: xorl %edx,%edx jmp *%ebp .L036ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L037ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L038ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl 
.L039ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L040ejend .L041ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L042ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L043ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L040ejend: xorl %ecx,%eax @@ -1837,12 +1947,34 @@ DES_SPtrans: .long 8519680,131200,537002112,545259520 .long 128,545390592,8519808,0 .long 536870912,545259648,131072,8519808 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl DES_SPtrans .type _x86_DES_encrypt,@function .align 16 _x86_DES_encrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ecx movl (%ecx),%eax @@ -2314,6 +2446,11 @@ _x86_DES_encrypt: .type _x86_DES_decrypt,@function .align 16 _x86_DES_decrypt: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ecx movl 120(%ecx),%eax @@ -2787,6 +2924,11 @@ _x86_DES_decrypt: .align 16 DES_encrypt1: .L_DES_encrypt1_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi @@ -2900,6 +3042,11 @@ DES_encrypt1: .align 16 DES_encrypt2: .L_DES_encrypt2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi @@ -2943,6 +3090,11 @@ DES_encrypt2: .align 16 DES_encrypt3: .L_DES_encrypt3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ebx pushl %ebp @@ -3064,6 +3216,11 @@ DES_encrypt3: .align 16 DES_decrypt3: .L_DES_decrypt3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx movl 8(%esp),%ebx pushl %ebp @@ -3185,6 +3342,11 @@ DES_decrypt3: .align 16 DES_ncbc_encrypt: .L_DES_ncbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -3246,21 +3408,56 @@ DES_ncbc_encrypt: xorl %edx,%edx jmp *%ebp .L012ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L013ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L014ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L015ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L016ejend .L017ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L018ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L019ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L016ejend: xorl %ecx,%eax @@ -3365,6 +3562,11 @@ DES_ncbc_encrypt: .align 16 DES_ede3_cbc_encrypt: .L_DES_ede3_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -3430,21 +3632,56 @@ DES_ede3_cbc_encrypt: xorl %edx,%edx jmp *%ebp .L036ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L037ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L038ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L039ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L040ejend .L041ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L042ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L043ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L040ejend: xorl %ecx,%eax @@ -3675,4 +3912,21 @@ DES_SPtrans: .long 8519680,131200,537002112,545259520 .long 128,545390592,8519808,0 
.long 536870912,545259648,131072,8519808 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/e_padlock-x86.S b/sys/crypto/openssl/i386/e_padlock-x86.S index 66a152488e90f..164ebe3cdd80a 100644 --- a/sys/crypto/openssl/i386/e_padlock-x86.S +++ b/sys/crypto/openssl/i386/e_padlock-x86.S @@ -6,6 +6,11 @@ .align 16 padlock_capability: .L_padlock_capability_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx pushfl popl %eax @@ -66,6 +71,11 @@ padlock_capability: .align 16 padlock_key_bswap: .L_padlock_key_bswap_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 240(%edx),%ecx incl %ecx @@ -84,6 +94,11 @@ padlock_key_bswap: .align 16 padlock_verify_context: .L_padlock_verify_context_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx leal .Lpadlock_saved_context-.L004verify_pic_point,%eax pushfl @@ -95,6 +110,11 @@ padlock_verify_context: .type _padlock_verify_ctx,@function .align 16 _padlock_verify_ctx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%eax btl $30,4(%esp) jnc .L005verified @@ -111,6 +131,11 @@ _padlock_verify_ctx: .align 16 padlock_reload_key: .L_padlock_reload_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushfl popfl ret @@ -120,6 +145,11 @@ padlock_reload_key: .align 16 padlock_aes_block: .L_padlock_aes_block_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi pushl %ebx @@ -140,6 +170,11 @@ padlock_aes_block: .align 16 padlock_ecb_encrypt: .L_padlock_ecb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -319,6 +354,11 @@ padlock_ecb_encrypt: .align 16 padlock_cbc_encrypt: .L_padlock_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -502,6 +542,11 @@ padlock_cbc_encrypt: .align 16 padlock_cfb_encrypt: .L_padlock_cfb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -624,6 +669,11 @@ padlock_cfb_encrypt: .align 16 padlock_ofb_encrypt: .L_padlock_ofb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -746,6 +796,11 @@ padlock_ofb_encrypt: .align 16 padlock_ctr32_encrypt: .L_padlock_ctr32_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -853,6 +908,11 @@ padlock_ctr32_encrypt: .align 16 padlock_xstore: .L_padlock_xstore_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi movl 8(%esp),%edi movl 12(%esp),%edx @@ -863,6 +923,11 @@ padlock_xstore: .type _win32_segv_handler,@function .align 16 _win32_segv_handler: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $1,%eax movl 4(%esp),%edx movl 12(%esp),%ecx @@ -878,6 +943,11 @@ _win32_segv_handler: .align 16 padlock_sha1_oneshot: .L_padlock_sha1_oneshot_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi xorl %eax,%eax @@ -909,6 +979,11 @@ padlock_sha1_oneshot: .align 16 padlock_sha1_blocks: .L_padlock_sha1_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -939,6 +1014,11 @@ padlock_sha1_blocks: .align 16 padlock_sha256_oneshot: .L_padlock_sha256_oneshot_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi xorl %eax,%eax @@ 
-970,6 +1050,11 @@ padlock_sha256_oneshot: .align 16 padlock_sha256_blocks: .L_padlock_sha256_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -1000,6 +1085,11 @@ padlock_sha256_blocks: .align 16 padlock_sha512_blocks: .L_padlock_sha512_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -1041,6 +1131,23 @@ padlock_sha512_blocks: .align 4 .Lpadlock_saved_context: .long 0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl padlock_capability @@ -1048,6 +1155,11 @@ padlock_sha512_blocks: .align 16 padlock_capability: .L_padlock_capability_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebx pushfl popl %eax @@ -1108,6 +1220,11 @@ padlock_capability: .align 16 padlock_key_bswap: .L_padlock_key_bswap_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 240(%edx),%ecx incl %ecx @@ -1126,6 +1243,11 @@ padlock_key_bswap: .align 16 padlock_verify_context: .L_padlock_verify_context_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx leal .Lpadlock_saved_context-.L004verify_pic_point,%eax pushfl @@ -1137,6 +1259,11 @@ padlock_verify_context: .type _padlock_verify_ctx,@function .align 16 _padlock_verify_ctx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%eax btl $30,4(%esp) jnc .L005verified @@ -1153,6 +1280,11 @@ _padlock_verify_ctx: .align 16 padlock_reload_key: .L_padlock_reload_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushfl popfl ret @@ -1162,6 +1294,11 @@ padlock_reload_key: .align 16 padlock_aes_block: .L_padlock_aes_block_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi pushl %ebx @@ -1182,6 +1319,11 @@ padlock_aes_block: .align 16 padlock_ecb_encrypt: .L_padlock_ecb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1361,6 +1503,11 @@ padlock_ecb_encrypt: .align 16 padlock_cbc_encrypt: .L_padlock_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1544,6 +1691,11 @@ padlock_cbc_encrypt: .align 16 padlock_cfb_encrypt: .L_padlock_cfb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1666,6 +1818,11 @@ padlock_cfb_encrypt: .align 16 padlock_ofb_encrypt: .L_padlock_ofb_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1788,6 +1945,11 @@ padlock_ofb_encrypt: .align 16 padlock_ctr32_encrypt: .L_padlock_ctr32_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1895,6 +2057,11 @@ padlock_ctr32_encrypt: .align 16 padlock_xstore: .L_padlock_xstore_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi movl 8(%esp),%edi movl 12(%esp),%edx @@ -1905,6 +2072,11 @@ padlock_xstore: .type _win32_segv_handler,@function .align 16 _win32_segv_handler: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $1,%eax movl 4(%esp),%edx movl 12(%esp),%ecx @@ -1920,6 +2092,11 @@ _win32_segv_handler: .align 16 padlock_sha1_oneshot: .L_padlock_sha1_oneshot_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi xorl %eax,%eax @@ -1951,6 +2128,11 @@ padlock_sha1_oneshot: .align 16 padlock_sha1_blocks: 
.L_padlock_sha1_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -1981,6 +2163,11 @@ padlock_sha1_blocks: .align 16 padlock_sha256_oneshot: .L_padlock_sha256_oneshot_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi xorl %eax,%eax @@ -2012,6 +2199,11 @@ padlock_sha256_oneshot: .align 16 padlock_sha256_blocks: .L_padlock_sha256_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -2042,6 +2234,11 @@ padlock_sha256_blocks: .align 16 padlock_sha512_blocks: .L_padlock_sha512_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %esi movl 12(%esp),%edi @@ -2083,4 +2280,21 @@ padlock_sha512_blocks: .align 4 .Lpadlock_saved_context: .long 0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/ecp_nistz256-x86.S b/sys/crypto/openssl/i386/ecp_nistz256-x86.S index b72eda775c9db..d304619280c7a 100644 --- a/sys/crypto/openssl/i386/ecp_nistz256-x86.S +++ b/sys/crypto/openssl/i386/ecp_nistz256-x86.S @@ -2389,6 +2389,11 @@ ecp_nistz256_precomputed: .align 16 ecp_nistz256_mul_by_2: .L_ecp_nistz256_mul_by_2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2408,6 +2413,11 @@ ecp_nistz256_mul_by_2: .align 16 ecp_nistz256_mul_by_3: .L_ecp_nistz256_mul_by_3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2433,6 +2443,11 @@ ecp_nistz256_mul_by_3: .align 16 ecp_nistz256_div_by_2: .L_ecp_nistz256_div_by_2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2449,6 +2464,11 @@ ecp_nistz256_div_by_2: .type _ecp_nistz256_div_by_2,@function .align 16 _ecp_nistz256_div_by_2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ebp xorl %edx,%edx movl 4(%esi),%ebx @@ -2532,6 +2552,11 @@ _ecp_nistz256_div_by_2: .align 16 ecp_nistz256_add: .L_ecp_nistz256_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2549,6 +2574,11 @@ ecp_nistz256_add: .type _ecp_nistz256_add,@function .align 16 _ecp_nistz256_add: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -2626,6 +2656,11 @@ _ecp_nistz256_add: .align 16 ecp_nistz256_sub: .L_ecp_nistz256_sub_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2643,6 +2678,11 @@ ecp_nistz256_sub: .type _ecp_nistz256_sub,@function .align 16 _ecp_nistz256_sub: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -2701,6 +2741,11 @@ _ecp_nistz256_sub: .align 16 ecp_nistz256_neg: .L_ecp_nistz256_neg_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2729,6 +2774,11 @@ ecp_nistz256_neg: .type _picup_eax,@function .align 16 _picup_eax: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esp),%eax ret .size _picup_eax,.-_picup_eax @@ -2737,6 +2787,11 @@ _picup_eax: .align 16 ecp_nistz256_to_mont: .L_ecp_nistz256_to_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2760,6 +2815,11 @@ ecp_nistz256_to_mont: .align 16 ecp_nistz256_from_mont: .L_ecp_nistz256_from_mont_begin: + #ifdef __CET__ + +.byte 
243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2783,6 +2843,11 @@ ecp_nistz256_from_mont: .align 16 ecp_nistz256_mul_mont: .L_ecp_nistz256_mul_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2806,6 +2871,11 @@ ecp_nistz256_mul_mont: .align 16 ecp_nistz256_sqr_mont: .L_ecp_nistz256_sqr_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2827,6 +2897,11 @@ ecp_nistz256_sqr_mont: .type _ecp_nistz256_mul_mont,@function .align 16 _ecp_nistz256_mul_mont: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + andl $83886080,%eax cmpl $83886080,%eax jne .L004mul_mont_ialu @@ -3724,6 +3799,11 @@ _ecp_nistz256_mul_mont: .align 16 ecp_nistz256_scatter_w5: .L_ecp_nistz256_scatter_w5_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3757,6 +3837,11 @@ ecp_nistz256_scatter_w5: .align 16 ecp_nistz256_gather_w5: .L_ecp_nistz256_gather_w5_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3851,6 +3936,11 @@ ecp_nistz256_gather_w5: .align 16 ecp_nistz256_scatter_w7: .L_ecp_nistz256_scatter_w7_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3882,6 +3972,11 @@ ecp_nistz256_scatter_w7: .align 16 ecp_nistz256_gather_w7: .L_ecp_nistz256_gather_w7_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4096,6 +4191,11 @@ ecp_nistz256_gather_w7: .align 16 ecp_nistz256_point_double: .L_ecp_nistz256_point_double_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4222,6 +4322,11 @@ ecp_nistz256_point_double: .align 16 ecp_nistz256_point_add: .L_ecp_nistz256_point_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -4735,6 +4840,11 @@ ecp_nistz256_point_add: .align 16 ecp_nistz256_point_add_affine: .L_ecp_nistz256_point_add_affine_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5163,6 +5273,23 @@ ecp_nistz256_point_add_affine: ret .size ecp_nistz256_point_add_affine,.-.L_ecp_nistz256_point_add_affine_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl ecp_nistz256_precomputed @@ -7553,6 +7680,11 @@ ecp_nistz256_precomputed: .align 16 ecp_nistz256_mul_by_2: .L_ecp_nistz256_mul_by_2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7572,6 +7704,11 @@ ecp_nistz256_mul_by_2: .align 16 ecp_nistz256_mul_by_3: .L_ecp_nistz256_mul_by_3_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7597,6 +7734,11 @@ ecp_nistz256_mul_by_3: .align 16 ecp_nistz256_div_by_2: .L_ecp_nistz256_div_by_2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7613,6 +7755,11 @@ ecp_nistz256_div_by_2: .type _ecp_nistz256_div_by_2,@function .align 16 _ecp_nistz256_div_by_2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ebp xorl %edx,%edx movl 4(%esi),%ebx @@ -7696,6 +7843,11 @@ _ecp_nistz256_div_by_2: .align 16 ecp_nistz256_add: .L_ecp_nistz256_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7713,6 +7865,11 @@ ecp_nistz256_add: .type 
_ecp_nistz256_add,@function .align 16 _ecp_nistz256_add: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -7790,6 +7947,11 @@ _ecp_nistz256_add: .align 16 ecp_nistz256_sub: .L_ecp_nistz256_sub_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7807,6 +7969,11 @@ ecp_nistz256_sub: .type _ecp_nistz256_sub,@function .align 16 _ecp_nistz256_sub: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx @@ -7865,6 +8032,11 @@ _ecp_nistz256_sub: .align 16 ecp_nistz256_neg: .L_ecp_nistz256_neg_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7893,6 +8065,11 @@ ecp_nistz256_neg: .type _picup_eax,@function .align 16 _picup_eax: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esp),%eax ret .size _picup_eax,.-_picup_eax @@ -7901,6 +8078,11 @@ _picup_eax: .align 16 ecp_nistz256_to_mont: .L_ecp_nistz256_to_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7924,6 +8106,11 @@ ecp_nistz256_to_mont: .align 16 ecp_nistz256_from_mont: .L_ecp_nistz256_from_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7947,6 +8134,11 @@ ecp_nistz256_from_mont: .align 16 ecp_nistz256_mul_mont: .L_ecp_nistz256_mul_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7970,6 +8162,11 @@ ecp_nistz256_mul_mont: .align 16 ecp_nistz256_sqr_mont: .L_ecp_nistz256_sqr_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7991,6 +8188,11 @@ ecp_nistz256_sqr_mont: .type _ecp_nistz256_mul_mont,@function .align 16 _ecp_nistz256_mul_mont: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + andl $83886080,%eax cmpl $83886080,%eax jne .L004mul_mont_ialu @@ -8888,6 +9090,11 @@ _ecp_nistz256_mul_mont: .align 16 ecp_nistz256_scatter_w5: .L_ecp_nistz256_scatter_w5_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -8921,6 +9128,11 @@ ecp_nistz256_scatter_w5: .align 16 ecp_nistz256_gather_w5: .L_ecp_nistz256_gather_w5_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9015,6 +9227,11 @@ ecp_nistz256_gather_w5: .align 16 ecp_nistz256_scatter_w7: .L_ecp_nistz256_scatter_w7_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9046,6 +9263,11 @@ ecp_nistz256_scatter_w7: .align 16 ecp_nistz256_gather_w7: .L_ecp_nistz256_gather_w7_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9260,6 +9482,11 @@ ecp_nistz256_gather_w7: .align 16 ecp_nistz256_point_double: .L_ecp_nistz256_point_double_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9386,6 +9613,11 @@ ecp_nistz256_point_double: .align 16 ecp_nistz256_point_add: .L_ecp_nistz256_point_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -9899,6 +10131,11 @@ ecp_nistz256_point_add: .align 16 ecp_nistz256_point_add_affine: .L_ecp_nistz256_point_add_affine_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -10327,4 +10564,21 @@ ecp_nistz256_point_add_affine: ret .size ecp_nistz256_point_add_affine,.-.L_ecp_nistz256_point_add_affine_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + 
.p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/ghash-x86.S b/sys/crypto/openssl/i386/ghash-x86.S index 2d031cdd47e58..e855db4c1b556 100644 --- a/sys/crypto/openssl/i386/ghash-x86.S +++ b/sys/crypto/openssl/i386/ghash-x86.S @@ -6,6 +6,11 @@ .align 16 gcm_gmult_4bit_x86: .L_gcm_gmult_4bit_x86_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -100,6 +105,11 @@ gcm_gmult_4bit_x86: .align 16 gcm_ghash_4bit_x86: .L_gcm_ghash_4bit_x86_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -209,6 +219,11 @@ gcm_ghash_4bit_x86: .align 16 gcm_gmult_4bit_mmx: .L_gcm_gmult_4bit_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -308,6 +323,11 @@ gcm_gmult_4bit_mmx: .align 16 gcm_ghash_4bit_mmx: .L_gcm_ghash_4bit_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -912,6 +932,11 @@ gcm_ghash_4bit_mmx: .align 16 gcm_init_clmul: .L_gcm_init_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax call .L010pic @@ -981,6 +1006,11 @@ gcm_init_clmul: .align 16 gcm_gmult_clmul: .L_gcm_gmult_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%edx call .L011pic @@ -1034,6 +1064,11 @@ gcm_gmult_clmul: .align 16 gcm_ghash_clmul: .L_gcm_ghash_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1264,6 +1299,23 @@ gcm_ghash_clmul: .byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl gcm_gmult_4bit_x86 @@ -1271,6 +1323,11 @@ gcm_ghash_clmul: .align 16 gcm_gmult_4bit_x86: .L_gcm_gmult_4bit_x86_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1365,6 +1422,11 @@ gcm_gmult_4bit_x86: .align 16 gcm_ghash_4bit_x86: .L_gcm_ghash_4bit_x86_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1474,6 +1536,11 @@ gcm_ghash_4bit_x86: .align 16 gcm_gmult_4bit_mmx: .L_gcm_gmult_4bit_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1573,6 +1640,11 @@ gcm_gmult_4bit_mmx: .align 16 gcm_ghash_4bit_mmx: .L_gcm_ghash_4bit_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2177,6 +2249,11 @@ gcm_ghash_4bit_mmx: .align 16 gcm_init_clmul: .L_gcm_init_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax call .L010pic @@ -2246,6 +2323,11 @@ gcm_init_clmul: .align 16 gcm_gmult_clmul: .L_gcm_gmult_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%eax movl 8(%esp),%edx call .L011pic @@ -2299,6 +2381,11 @@ gcm_gmult_clmul: .align 16 gcm_ghash_clmul: .L_gcm_ghash_clmul_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2529,4 +2616,21 @@ gcm_ghash_clmul: .byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 .byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 .byte 0 + + .section 
".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/md5-586.S b/sys/crypto/openssl/i386/md5-586.S index 1b1a07ec96d19..2d249dd7230fc 100644 --- a/sys/crypto/openssl/i386/md5-586.S +++ b/sys/crypto/openssl/i386/md5-586.S @@ -1,11 +1,16 @@ /* Do not modify. This file is auto-generated from md5-586.pl. */ #ifdef PIC .text -.globl md5_block_asm_data_order -.type md5_block_asm_data_order,@function +.globl ossl_md5_block_asm_data_order +.type ossl_md5_block_asm_data_order,@function .align 16 -md5_block_asm_data_order: -.L_md5_block_asm_data_order_begin: +ossl_md5_block_asm_data_order: +.L_ossl_md5_block_asm_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi movl 12(%esp),%edi @@ -677,14 +682,36 @@ md5_block_asm_data_order: popl %edi popl %esi ret -.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin +.size ossl_md5_block_asm_data_order,.-.L_ossl_md5_block_asm_data_order_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text -.globl md5_block_asm_data_order -.type md5_block_asm_data_order,@function +.globl ossl_md5_block_asm_data_order +.type ossl_md5_block_asm_data_order,@function .align 16 -md5_block_asm_data_order: -.L_md5_block_asm_data_order_begin: +ossl_md5_block_asm_data_order: +.L_ossl_md5_block_asm_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi movl 12(%esp),%edi @@ -1356,5 +1383,22 @@ md5_block_asm_data_order: popl %edi popl %esi ret -.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin +.size ossl_md5_block_asm_data_order,.-.L_ossl_md5_block_asm_data_order_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/poly1305-x86.S b/sys/crypto/openssl/i386/poly1305-x86.S index f8a678b3dc310..38af67d4ffc98 100644 --- a/sys/crypto/openssl/i386/poly1305-x86.S +++ b/sys/crypto/openssl/i386/poly1305-x86.S @@ -7,6 +7,11 @@ .align 16 poly1305_init: .L_poly1305_init_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -68,6 +73,11 @@ poly1305_init: .align 16 poly1305_blocks: .L_poly1305_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -236,6 +246,11 @@ poly1305_blocks: .align 16 poly1305_emit: .L_poly1305_emit_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -295,6 +310,11 @@ poly1305_emit: .type _poly1305_init_sse2,@function .align 16 _poly1305_init_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp @@ -497,6 +517,11 @@ _poly1305_init_sse2: .type _poly1305_blocks_sse2,@function .align 16 _poly1305_blocks_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1258,6 +1283,11 @@ _poly1305_blocks_sse2: .type _poly1305_emit_sse2,@function .align 16 _poly1305_emit_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1351,6 +1381,11 @@ _poly1305_emit_sse2: .type _poly1305_init_avx2,@function 
.align 16 _poly1305_init_avx2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + vmovdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp @@ -1522,6 +1557,11 @@ _poly1305_init_avx2: .type _poly1305_blocks_avx2,@function .align 16 _poly1305_blocks_avx2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1910,6 +1950,23 @@ _poly1305_blocks_avx2: .byte 114,103,62,0 .align 4 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .align 64 @@ -1918,6 +1975,11 @@ _poly1305_blocks_avx2: .align 16 poly1305_init: .L_poly1305_init_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1979,6 +2041,11 @@ poly1305_init: .align 16 poly1305_blocks: .L_poly1305_blocks_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2147,6 +2214,11 @@ poly1305_blocks: .align 16 poly1305_emit: .L_poly1305_emit_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2206,6 +2278,11 @@ poly1305_emit: .type _poly1305_init_sse2,@function .align 16 _poly1305_init_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp @@ -2408,6 +2485,11 @@ _poly1305_init_sse2: .type _poly1305_blocks_sse2,@function .align 16 _poly1305_blocks_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3169,6 +3251,11 @@ _poly1305_blocks_sse2: .type _poly1305_emit_sse2,@function .align 16 _poly1305_emit_sse2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3262,6 +3349,11 @@ _poly1305_emit_sse2: .type _poly1305_init_avx2,@function .align 16 _poly1305_init_avx2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + vmovdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp @@ -3433,6 +3525,11 @@ _poly1305_init_avx2: .type _poly1305_blocks_avx2,@function .align 16 _poly1305_blocks_avx2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3821,4 +3918,21 @@ _poly1305_blocks_avx2: .byte 114,103,62,0 .align 4 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/rc4-586.S b/sys/crypto/openssl/i386/rc4-586.S index 1d076f3d6b101..98f5cacec9a69 100644 --- a/sys/crypto/openssl/i386/rc4-586.S +++ b/sys/crypto/openssl/i386/rc4-586.S @@ -6,6 +6,11 @@ .align 16 RC4: .L_RC4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -272,6 +277,11 @@ RC4: .align 16 RC4_set_key: .L_RC4_set_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -350,6 +360,11 @@ RC4_set_key: .align 16 RC4_options: .L_RC4_options_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L018pic_point .L018pic_point: popl %eax @@ -380,6 +395,23 @@ RC4_options: .align 64 .size RC4_options,.-.L_RC4_options_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl RC4 @@ -387,6 +419,11 @@ 
RC4_options: .align 16 RC4: .L_RC4_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -650,6 +687,11 @@ RC4: .align 16 RC4_set_key: .L_RC4_set_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -725,6 +767,11 @@ RC4_set_key: .align 16 RC4_options: .L_RC4_options_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L016pic_point .L016pic_point: popl %eax @@ -752,4 +799,21 @@ RC4_options: .align 64 .size RC4_options,.-.L_RC4_options_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/rc5-586.S b/sys/crypto/openssl/i386/rc5-586.S index 5ab6bea14b3a8..72db8f41c442f 100644 --- a/sys/crypto/openssl/i386/rc5-586.S +++ b/sys/crypto/openssl/i386/rc5-586.S @@ -6,6 +6,11 @@ .align 16 RC5_32_encrypt: .L_RC5_32_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %esi @@ -197,6 +202,11 @@ RC5_32_encrypt: .align 16 RC5_32_decrypt: .L_RC5_32_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %esi @@ -390,6 +400,11 @@ RC5_32_decrypt: .align 16 RC5_32_cbc_encrypt: .L_RC5_32_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -449,21 +464,56 @@ RC5_32_cbc_encrypt: xorl %edx,%edx jmp *%ebp .L010ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L011ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L012ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L013ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L014ejend .L015ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L016ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 1(%esi),%ch .L017ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L014ejend: xorl %ecx,%eax @@ -563,6 +613,23 @@ RC5_32_cbc_encrypt: .long .L010ej7-.L008PIC_point .align 64 .size RC5_32_cbc_encrypt,.-.L_RC5_32_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl RC5_32_encrypt @@ -570,6 +637,11 @@ RC5_32_cbc_encrypt: .align 16 RC5_32_encrypt: .L_RC5_32_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %esi @@ -761,6 +833,11 @@ RC5_32_encrypt: .align 16 RC5_32_decrypt: .L_RC5_32_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %esi @@ -954,6 +1031,11 @@ RC5_32_decrypt: .align 16 RC5_32_cbc_encrypt: .L_RC5_32_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx @@ -1013,21 +1095,56 @@ RC5_32_cbc_encrypt: xorl %edx,%edx jmp *%ebp .L010ej7: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 6(%esi),%dh shll $8,%edx .L011ej6: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 5(%esi),%dh .L012ej5: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 4(%esi),%dl .L013ej4: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl (%esi),%ecx jmp .L014ejend .L015ej3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb 2(%esi),%ch shll $8,%ecx .L016ej2: + #ifdef __CET__ + +.byte 243,15,30,251 + 
#endif + movb 1(%esi),%ch .L017ej1: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movb (%esi),%cl .L014ejend: xorl %ecx,%eax @@ -1127,4 +1244,21 @@ RC5_32_cbc_encrypt: .long .L010ej7-.L008PIC_point .align 64 .size RC5_32_cbc_encrypt,.-.L_RC5_32_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/rmd-586.S b/sys/crypto/openssl/i386/rmd-586.S index 5b6be50568b5e..20f64bb75cc50 100644 --- a/sys/crypto/openssl/i386/rmd-586.S +++ b/sys/crypto/openssl/i386/rmd-586.S @@ -6,6 +6,11 @@ .align 16 ripemd160_block_asm_data_order: .L_ripemd160_block_asm_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax pushl %esi @@ -1964,6 +1969,23 @@ ripemd160_block_asm_data_order: popl %esi ret .size ripemd160_block_asm_data_order,.-.L_ripemd160_block_asm_data_order_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl ripemd160_block_asm_data_order @@ -1971,6 +1993,11 @@ ripemd160_block_asm_data_order: .align 16 ripemd160_block_asm_data_order: .L_ripemd160_block_asm_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%eax pushl %esi @@ -3929,4 +3956,21 @@ ripemd160_block_asm_data_order: popl %esi ret .size ripemd160_block_asm_data_order,.-.L_ripemd160_block_asm_data_order_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/sha1-586.S b/sys/crypto/openssl/i386/sha1-586.S index 1b6b1a420f52b..19e598226a1de 100644 --- a/sys/crypto/openssl/i386/sha1-586.S +++ b/sys/crypto/openssl/i386/sha1-586.S @@ -6,6 +6,11 @@ .align 16 sha1_block_data_order: .L_sha1_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1400,6 +1405,11 @@ sha1_block_data_order: .type _sha1_block_data_order_shaext,@function .align 16 _sha1_block_data_order_shaext: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1570,6 +1580,11 @@ _sha1_block_data_order_shaext: .type _sha1_block_data_order_ssse3,@function .align 16 _sha1_block_data_order_ssse3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2789,6 +2804,11 @@ _sha1_block_data_order_ssse3: .type _sha1_block_data_order_avx,@function .align 16 _sha1_block_data_order_avx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -3969,6 +3989,23 @@ _sha1_block_data_order_avx: .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl sha1_block_data_order @@ -3976,6 +4013,11 @@ _sha1_block_data_order_avx: .align 16 sha1_block_data_order: .L_sha1_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5370,6 
+5412,11 @@ sha1_block_data_order: .type _sha1_block_data_order_shaext,@function .align 16 _sha1_block_data_order_shaext: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5540,6 +5587,11 @@ _sha1_block_data_order_shaext: .type _sha1_block_data_order_ssse3,@function .align 16 _sha1_block_data_order_ssse3: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -6759,6 +6811,11 @@ _sha1_block_data_order_ssse3: .type _sha1_block_data_order_avx,@function .align 16 _sha1_block_data_order_avx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -7939,4 +7996,21 @@ _sha1_block_data_order_avx: .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/sha256-586.S b/sys/crypto/openssl/i386/sha256-586.S index 454030c18cc67..f0ed09ebf84a8 100644 --- a/sys/crypto/openssl/i386/sha256-586.S +++ b/sys/crypto/openssl/i386/sha256-586.S @@ -6,6 +6,11 @@ .align 16 sha256_block_data_order: .L_sha256_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -6782,6 +6787,23 @@ sha256_block_data_order: ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl sha256_block_data_order @@ -6789,6 +6811,11 @@ sha256_block_data_order: .align 16 sha256_block_data_order: .L_sha256_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -13565,4 +13592,21 @@ sha256_block_data_order: ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/sha512-586.S b/sys/crypto/openssl/i386/sha512-586.S index c5f6b718d56b7..a7385b55e0acf 100644 --- a/sys/crypto/openssl/i386/sha512-586.S +++ b/sys/crypto/openssl/i386/sha512-586.S @@ -6,6 +6,11 @@ .align 16 sha512_block_data_order: .L_sha512_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2828,6 +2833,23 @@ sha512_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl sha512_block_data_order @@ -2835,6 +2857,11 @@ sha512_block_data_order: .align 16 sha512_block_data_order: .L_sha512_block_data_order_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -5657,4 +5684,21 @@ sha512_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section 
".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/vpaes-x86.S b/sys/crypto/openssl/i386/vpaes-x86.S index 1c9cef1456a07..0d479de7c30de 100644 --- a/sys/crypto/openssl/i386/vpaes-x86.S +++ b/sys/crypto/openssl/i386/vpaes-x86.S @@ -61,6 +61,11 @@ .type _vpaes_preheat,@function .align 16 _vpaes_preheat: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%ebp movdqa -48(%ebp),%xmm7 movdqa -16(%ebp),%xmm6 @@ -69,6 +74,11 @@ _vpaes_preheat: .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $16,%ecx movl 240(%edx),%eax movdqa %xmm6,%xmm1 @@ -146,6 +156,11 @@ _vpaes_encrypt_core: .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal 608(%ebp),%ebx movl 240(%edx),%eax movdqa %xmm6,%xmm1 @@ -234,6 +249,11 @@ _vpaes_decrypt_core: .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%ebp movdqu (%esi),%xmm0 movdqa 320(%ebp),%xmm2 @@ -328,6 +348,11 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -340,6 +365,11 @@ _vpaes_schedule_192_smear: .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa 8(%esp),%xmm2 pxor %xmm1,%xmm1 .byte 102,15,58,15,202,15 @@ -389,6 +419,11 @@ _vpaes_schedule_round: .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa -16(%ebp),%xmm2 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 @@ -404,6 +439,11 @@ _vpaes_schedule_transform: .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa %xmm0,%xmm4 movdqa 128(%ebp),%xmm5 testl %edi,%edi @@ -465,6 +505,11 @@ _vpaes_schedule_mangle: .align 16 vpaes_set_encrypt_key: .L_vpaes_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -498,6 +543,11 @@ vpaes_set_encrypt_key: .align 16 vpaes_set_decrypt_key: .L_vpaes_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -536,6 +586,11 @@ vpaes_set_decrypt_key: .align 16 vpaes_encrypt: .L_vpaes_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -565,6 +620,11 @@ vpaes_encrypt: .align 16 vpaes_decrypt: .L_vpaes_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -594,6 +654,11 @@ vpaes_decrypt: .align 16 vpaes_cbc_encrypt: .L_vpaes_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -660,6 +725,23 @@ vpaes_cbc_encrypt: popl %ebp ret .size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .align 64 @@ -722,6 +804,11 @@ vpaes_cbc_encrypt: .type _vpaes_preheat,@function .align 16 _vpaes_preheat: + #ifdef __CET__ + +.byte 
243,15,30,251 + #endif + addl (%esp),%ebp movdqa -48(%ebp),%xmm7 movdqa -16(%ebp),%xmm6 @@ -730,6 +817,11 @@ _vpaes_preheat: .type _vpaes_encrypt_core,@function .align 16 _vpaes_encrypt_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl $16,%ecx movl 240(%edx),%eax movdqa %xmm6,%xmm1 @@ -807,6 +899,11 @@ _vpaes_encrypt_core: .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal 608(%ebp),%ebx movl 240(%edx),%eax movdqa %xmm6,%xmm1 @@ -895,6 +992,11 @@ _vpaes_decrypt_core: .type _vpaes_schedule_core,@function .align 16 _vpaes_schedule_core: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + addl (%esp),%ebp movdqu (%esi),%xmm0 movdqa 320(%ebp),%xmm2 @@ -989,6 +1091,11 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pshufd $128,%xmm6,%xmm1 pshufd $254,%xmm7,%xmm0 pxor %xmm1,%xmm6 @@ -1001,6 +1108,11 @@ _vpaes_schedule_192_smear: .type _vpaes_schedule_round,@function .align 16 _vpaes_schedule_round: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa 8(%esp),%xmm2 pxor %xmm1,%xmm1 .byte 102,15,58,15,202,15 @@ -1050,6 +1162,11 @@ _vpaes_schedule_round: .type _vpaes_schedule_transform,@function .align 16 _vpaes_schedule_transform: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa -16(%ebp),%xmm2 movdqa %xmm2,%xmm1 pandn %xmm0,%xmm1 @@ -1065,6 +1182,11 @@ _vpaes_schedule_transform: .type _vpaes_schedule_mangle,@function .align 16 _vpaes_schedule_mangle: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movdqa %xmm0,%xmm4 movdqa 128(%ebp),%xmm5 testl %edi,%edi @@ -1126,6 +1248,11 @@ _vpaes_schedule_mangle: .align 16 vpaes_set_encrypt_key: .L_vpaes_set_encrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1159,6 +1286,11 @@ vpaes_set_encrypt_key: .align 16 vpaes_set_decrypt_key: .L_vpaes_set_decrypt_key_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1197,6 +1329,11 @@ vpaes_set_decrypt_key: .align 16 vpaes_encrypt: .L_vpaes_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1226,6 +1363,11 @@ vpaes_encrypt: .align 16 vpaes_decrypt: .L_vpaes_decrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1255,6 +1397,11 @@ vpaes_decrypt: .align 16 vpaes_cbc_encrypt: .L_vpaes_cbc_encrypt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1321,4 +1468,21 @@ vpaes_cbc_encrypt: popl %ebp ret .size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/wp-mmx.S b/sys/crypto/openssl/i386/wp-mmx.S index fc5df6274a9ca..407554c26515a 100644 --- a/sys/crypto/openssl/i386/wp-mmx.S +++ b/sys/crypto/openssl/i386/wp-mmx.S @@ -6,6 +6,11 @@ .align 16 whirlpool_block_mmx: .L_whirlpool_block_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -1106,6 +1111,23 @@ whirlpool_block_mmx: .byte 251,238,124,102,221,23,71,158 .byte 202,45,191,7,173,90,131,51 .size whirlpool_block_mmx,.-.L_whirlpool_block_mmx_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + 
.asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl whirlpool_block_mmx @@ -1113,6 +1135,11 @@ whirlpool_block_mmx: .align 16 whirlpool_block_mmx: .L_whirlpool_block_mmx_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -2213,4 +2240,21 @@ whirlpool_block_mmx: .byte 251,238,124,102,221,23,71,158 .byte 202,45,191,7,173,90,131,51 .size whirlpool_block_mmx,.-.L_whirlpool_block_mmx_begin + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/x86-gf2m.S b/sys/crypto/openssl/i386/x86-gf2m.S index 9c0e251781049..093988d3d5e2d 100644 --- a/sys/crypto/openssl/i386/x86-gf2m.S +++ b/sys/crypto/openssl/i386/x86-gf2m.S @@ -4,6 +4,11 @@ .type _mul_1x1_mmx,@function .align 16 _mul_1x1_mmx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + subl $36,%esp movl %eax,%ecx leal (%eax,%eax,1),%edx @@ -107,6 +112,11 @@ _mul_1x1_mmx: .type _mul_1x1_ialu,@function .align 16 _mul_1x1_ialu: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + subl $36,%esp movl %eax,%ecx leal (%eax,%eax,1),%edx @@ -241,6 +251,11 @@ _mul_1x1_ialu: .align 16 bn_GF2m_mul_2x2: .L_bn_GF2m_mul_2x2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L000PIC_me_up .L000PIC_me_up: popl %edx @@ -345,11 +360,33 @@ bn_GF2m_mul_2x2: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .type _mul_1x1_mmx,@function .align 16 _mul_1x1_mmx: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + subl $36,%esp movl %eax,%ecx leal (%eax,%eax,1),%edx @@ -453,6 +490,11 @@ _mul_1x1_mmx: .type _mul_1x1_ialu,@function .align 16 _mul_1x1_ialu: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + subl $36,%esp movl %eax,%ecx leal (%eax,%eax,1),%edx @@ -587,6 +629,11 @@ _mul_1x1_ialu: .align 16 bn_GF2m_mul_2x2: .L_bn_GF2m_mul_2x2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%edx movl (%edx),%eax movl 4(%edx),%edx @@ -688,4 +735,21 @@ bn_GF2m_mul_2x2: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/x86-mont.S b/sys/crypto/openssl/i386/x86-mont.S index 049c9e626e06f..4a5309b3e0db9 100644 --- a/sys/crypto/openssl/i386/x86-mont.S +++ b/sys/crypto/openssl/i386/x86-mont.S @@ -6,6 +6,11 @@ .align 16 bn_mul_mont: .L_bn_mul_mont_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -475,6 +480,23 @@ bn_mul_mont: .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl bn_mul_mont @@ -482,6 +504,11 @@ bn_mul_mont: .align 16 bn_mul_mont: .L_bn_mul_mont_begin: + 
#ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -948,4 +975,21 @@ bn_mul_mont: .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/i386/x86cpuid.S b/sys/crypto/openssl/i386/x86cpuid.S index d232734b64baa..2a7f7188b65fc 100644 --- a/sys/crypto/openssl/i386/x86cpuid.S +++ b/sys/crypto/openssl/i386/x86cpuid.S @@ -6,6 +6,11 @@ .align 16 OPENSSL_ia32_cpuid: .L_OPENSSL_ia32_cpuid_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -150,6 +155,11 @@ OPENSSL_ia32_cpuid: .align 16 OPENSSL_rdtsc: .L_OPENSSL_rdtsc_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl %eax,%eax xorl %edx,%edx call .L009PIC_me_up @@ -167,6 +177,11 @@ OPENSSL_rdtsc: .align 16 OPENSSL_instrument_halt: .L_OPENSSL_instrument_halt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + call .L011PIC_me_up .L011PIC_me_up: popl %ecx @@ -199,6 +214,11 @@ OPENSSL_instrument_halt: .align 16 OPENSSL_far_spin: .L_OPENSSL_far_spin_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushfl popl %eax btl $9,%eax @@ -226,6 +246,11 @@ OPENSSL_far_spin: .align 16 OPENSSL_wipe_cpu: .L_OPENSSL_wipe_cpu_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl %eax,%eax xorl %edx,%edx call .L015PIC_me_up @@ -257,6 +282,11 @@ OPENSSL_wipe_cpu: .align 16 OPENSSL_atomic_add: .L_OPENSSL_atomic_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%ecx pushl %ebx @@ -276,6 +306,11 @@ OPENSSL_atomic_add: .align 16 OPENSSL_cleanse: .L_OPENSSL_cleanse_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax @@ -313,6 +348,11 @@ OPENSSL_cleanse: .align 16 CRYPTO_memcmp: .L_CRYPTO_memcmp_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi movl 12(%esp),%esi @@ -342,6 +382,11 @@ CRYPTO_memcmp: .align 16 OPENSSL_instrument_bus: .L_OPENSSL_instrument_bus_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -390,6 +435,11 @@ OPENSSL_instrument_bus: .align 16 OPENSSL_instrument_bus2: .L_OPENSSL_instrument_bus2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -451,6 +501,11 @@ OPENSSL_instrument_bus2: .align 16 OPENSSL_ia32_rdrand_bytes: .L_OPENSSL_ia32_rdrand_bytes_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %ebx xorl %eax,%eax @@ -494,6 +549,11 @@ OPENSSL_ia32_rdrand_bytes: .align 16 OPENSSL_ia32_rdseed_bytes: .L_OPENSSL_ia32_rdseed_bytes_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %ebx xorl %eax,%eax @@ -537,6 +597,23 @@ OPENSSL_ia32_rdseed_bytes: .comm OPENSSL_ia32cap_P,16,4 .section .init call OPENSSL_cpuid_setup + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #else .text .globl OPENSSL_ia32_cpuid @@ -544,6 +621,11 @@ OPENSSL_ia32_rdseed_bytes: .align 16 OPENSSL_ia32_cpuid: .L_OPENSSL_ia32_cpuid_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -688,6 +770,11 @@ OPENSSL_ia32_cpuid: .align 16 
OPENSSL_rdtsc: .L_OPENSSL_rdtsc_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl %eax,%eax xorl %edx,%edx leal OPENSSL_ia32cap_P,%ecx @@ -702,6 +789,11 @@ OPENSSL_rdtsc: .align 16 OPENSSL_instrument_halt: .L_OPENSSL_instrument_halt_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + leal OPENSSL_ia32cap_P,%ecx btl $4,(%ecx) jnc .L010nohalt @@ -731,6 +823,11 @@ OPENSSL_instrument_halt: .align 16 OPENSSL_far_spin: .L_OPENSSL_far_spin_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushfl popl %eax btl $9,%eax @@ -758,6 +855,11 @@ OPENSSL_far_spin: .align 16 OPENSSL_wipe_cpu: .L_OPENSSL_wipe_cpu_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + xorl %eax,%eax xorl %edx,%edx leal OPENSSL_ia32cap_P,%ecx @@ -786,6 +888,11 @@ OPENSSL_wipe_cpu: .align 16 OPENSSL_atomic_add: .L_OPENSSL_atomic_add_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%ecx pushl %ebx @@ -805,6 +912,11 @@ OPENSSL_atomic_add: .align 16 OPENSSL_cleanse: .L_OPENSSL_cleanse_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + movl 4(%esp),%edx movl 8(%esp),%ecx xorl %eax,%eax @@ -842,6 +954,11 @@ OPENSSL_cleanse: .align 16 CRYPTO_memcmp: .L_CRYPTO_memcmp_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %esi pushl %edi movl 12(%esp),%esi @@ -871,6 +988,11 @@ CRYPTO_memcmp: .align 16 OPENSSL_instrument_bus: .L_OPENSSL_instrument_bus_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -916,6 +1038,11 @@ OPENSSL_instrument_bus: .align 16 OPENSSL_instrument_bus2: .L_OPENSSL_instrument_bus2_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %ebp pushl %ebx pushl %esi @@ -974,6 +1101,11 @@ OPENSSL_instrument_bus2: .align 16 OPENSSL_ia32_rdrand_bytes: .L_OPENSSL_ia32_rdrand_bytes_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %ebx xorl %eax,%eax @@ -1017,6 +1149,11 @@ OPENSSL_ia32_rdrand_bytes: .align 16 OPENSSL_ia32_rdseed_bytes: .L_OPENSSL_ia32_rdseed_bytes_begin: + #ifdef __CET__ + +.byte 243,15,30,251 + #endif + pushl %edi pushl %ebx xorl %eax,%eax @@ -1060,4 +1197,21 @@ OPENSSL_ia32_rdseed_bytes: .comm OPENSSL_ia32cap_P,16,4 .section .init call OPENSSL_cpuid_setup + + .section ".note.gnu.property", "a" + .p2align 2 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + .asciz "GNU" +1: + .p2align 2 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 2 +4: #endif diff --git a/sys/crypto/openssl/powerpc/bn-ppc.S b/sys/crypto/openssl/powerpc/bn-ppc.S new file mode 100644 index 0000000000000..166e2de1dddb2 --- /dev/null +++ b/sys/crypto/openssl/powerpc/bn-ppc.S @@ -0,0 +1,1855 @@ +/* Do not modify. This file is auto-generated from ppc.pl. 
*/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.globl bn_sqr_comba4 +.type bn_sqr_comba4,@function +.globl bn_sqr_comba8 +.type bn_sqr_comba8,@function +.globl bn_mul_comba4 +.type bn_mul_comba4,@function +.globl bn_mul_comba8 +.type bn_mul_comba8,@function +.globl bn_sub_words +.type bn_sub_words,@function +.globl bn_add_words +.type bn_add_words,@function +.globl bn_div_words +.type bn_div_words,@function +.globl bn_sqr_words +.type bn_sqr_words,@function +.globl bn_mul_words +.type bn_mul_words,@function +.globl bn_mul_add_words +.type bn_mul_add_words,@function + + + +.machine "any" +.text + + + + + + + + +.align 4 +bn_sqr_comba4: + + + + + + + + + + + + + + + + xor 0,0,0 + + + + lwz 5,0(4) + mullw 9,5,5 + mulhwu 10,5,5 + + + + + stw 9,0(3) + + lwz 6,4(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 9,0 + + + addc 10,7,10 + addze 11,8 + addze 9,9 + + stw 10,4(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + lwz 6,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,8(3) + + lwz 6,12(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,4(4) + lwz 6,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + stw 9,12(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + lwz 6,12(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,16(3) + + lwz 5,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,20(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 9,7,9 + adde 10,8,10 + + stw 9,24(3) + stw 10,28(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba4,.-bn_sqr_comba4 + + + + + + + + +.align 4 +bn_sqr_comba8: + + + + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + lwz 5,0(4) + mullw 9,5,5 + mulhwu 10,5,5 + stw 9,0(3) + + lwz 6,4(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,0 + addze 9,0 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + stw 10,4(3) + + + mullw 7,6,6 + mulhwu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + lwz 6,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + stw 11,8(3) + + lwz 6,12(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,4(4) + lwz 6,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + stw 9,12(3) + + mullw 7,6,6 + mulhwu 8,6,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + lwz 6,12(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,0(4) + lwz 6,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,16(3) + + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,4(4) + lwz 6,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,8(4) + lwz 6,12(4) + mullw 
7,5,6 + mulhwu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,20(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + lwz 6,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,4(4) + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,0(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + stw 9,24(3) + + lwz 6,28(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,4(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,8(4) + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,12(4) + lwz 6,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,28(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,8(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,4(4) + lwz 6,28(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,32(3) + + lwz 5,8(4) + mullw 7,5,6 + mulhwu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,12(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + lwz 5,16(4) + lwz 6,20(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + stw 9,36(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + lwz 5,12(4) + lwz 6,28(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,40(3) + + lwz 5,16(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + lwz 5,20(4) + lwz 6,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + stw 11,44(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + lwz 6,28(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + stw 9,48(3) + + + lwz 5,24(4) + mullw 7,5,6 + mulhwu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + stw 10,52(3) + + mullw 7,6,6 + mulhwu 8,6,6 + addc 11,7,11 + adde 9,8,9 + stw 11,56(3) + stw 9, 60(3) + + + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba8,.-bn_sqr_comba8 + + + + + + + + +.align 4 +bn_mul_comba4: + + + + + + + + + + + + xor 0,0,0 + + lwz 6,0(4) + lwz 7,0(5) + mullw 10,6,7 + mulhwu 11,6,7 + stw 10,0(3) + + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,0 + 
addze 10,0 + + lwz 6, 4(4) + lwz 7, 0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + stw 11,4(3) + + lwz 6,8(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + lwz 6,4(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + + lwz 6,0(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + stw 12,8(3) + + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,0 + + lwz 6,4(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + lwz 6,8(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + lwz 6,12(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + stw 10,12(3) + + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,0 + + lwz 6,8(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + + lwz 6,4(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + stw 11,16(3) + + lwz 6,8(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + lwz 6,12(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + stw 12,20(3) + + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,8,10 + adde 11,9,11 + + stw 10,24(3) + stw 11,28(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba4,.-bn_mul_comba4 + + + + + + + + +.align 4 +bn_mul_comba8: + + + + + + + + + + + + xor 0,0,0 + + + lwz 6,0(4) + lwz 7,0(5) + mullw 10,6,7 + mulhwu 11,6,7 + stw 10,0(3) + + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + addze 12,9 + addze 10,0 + + lwz 6,4(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 11,4(3) + + lwz 6,8(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + lwz 6,4(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,0(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + stw 12,8(3) + + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + lwz 6,4(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + + lwz 6,8(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,12(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + stw 10,12(3) + + lwz 6,16(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + lwz 6,12(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,8(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,4(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,0(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 11,16(3) + + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + lwz 6,4(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,8(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,12(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,16(4) + lwz 7,4(5) + mullw 8,6,7 + 
mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,20(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + stw 12,20(3) + + lwz 6,24(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + lwz 6,20(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,16(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,12(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,8(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,4(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,0(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + stw 10,24(3) + + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + lwz 6,4(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,8(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,12(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,16(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,20(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,24(4) + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,28(4) + lwz 7,0(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 11,28(3) + + lwz 7,4(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + lwz 6,24(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,20(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,16(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,12(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,8(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,4(4) + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + stw 12,32(3) + + lwz 6,8(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + lwz 6,12(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,16(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,20(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,24(4) + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,28(4) + lwz 7,8(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + stw 10,36(3) + + lwz 7,12(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + lwz 6,24(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,20(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,16(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + lwz 6,12(4) + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 
11,40(3) + + lwz 6,16(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + lwz 6,20(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,24(4) + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + lwz 6,28(4) + lwz 7,16(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + stw 12,44(3) + + lwz 7,20(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + lwz 6,24(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + lwz 6,20(4) + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + stw 10,48(3) + + lwz 6,24(4) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + lwz 6,28(4) + lwz 7,24(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + stw 11,52(3) + + lwz 7,28(5) + mullw 8,6,7 + mulhwu 9,6,7 + addc 12,12,8 + adde 10,10,9 + stw 12,56(3) + stw 10,60(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba8,.-bn_mul_comba8 + + + + + + + + +.align 4 +bn_sub_words: + + + + + + + + + + + + + + xor 0,0,0 + + + + subfc. 7,0,6 + + + beq .Lppcasm_sub_adios + addi 4,4,-4 + addi 3,3,-4 + addi 5,5,-4 + mtctr 6 +.Lppcasm_sub_mainloop: + lwzu 7,4(4) + lwzu 8,4(5) + subfe 6,8,7 + + + stwu 6,4(3) + bdnz .Lppcasm_sub_mainloop +.Lppcasm_sub_adios: + subfze 3,0 + andi. 3,3,1 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_sub_words,.-bn_sub_words + + + + + + + + +.align 4 +bn_add_words: + + + + + + + + + + + + + + xor 0,0,0 + + + + addic. 6,6,0 + beq .Lppcasm_add_adios + addi 4,4,-4 + addi 3,3,-4 + addi 5,5,-4 + mtctr 6 +.Lppcasm_add_mainloop: + lwzu 7,4(4) + lwzu 8,4(5) + adde 8,7,8 + stwu 8,4(3) + bdnz .Lppcasm_add_mainloop +.Lppcasm_add_adios: + addze 3,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_add_words,.-bn_add_words + + + + + + + + +.align 4 +bn_div_words: + + + + + + + + + + + + cmplwi 0,5,0 + bne .Lppcasm_div1 + li 3,-1 + blr +.Lppcasm_div1: + xor 0,0,0 + li 8,32 + cntlzw. 7,5 + beq .Lppcasm_div2 + subf 8,7,8 + srw. 9,3,8 + tw 16,9,0 +.Lppcasm_div2: + .long 0x7c032840 + blt .Lppcasm_div3 + subf 3,5,3 +.Lppcasm_div3: + cmpi 0,0,7,0 + beq .Lppcasm_div4 + slw 3,3,7 + srw 8,4,8 + slw 5,5,7 + or 3,3,8 + slw 4,4,7 +.Lppcasm_div4: + srwi 9,5,16 + + + li 6,2 + mtctr 6 +.Lppcasm_divouterloop: + srwi 8,3,16 + srwi 11,4,16 + + .long 0x7c084840 + bne .Lppcasm_div5 + + li 8,-1 + clrlwi 8,8,16 + b .Lppcasm_div6 +.Lppcasm_div5: + divwu 8,3,9 +.Lppcasm_div6: + mullw 12,9,8 + clrlwi 10,5,16 + mullw 6,8,10 + +.Lppcasm_divinnerloop: + subf 10,12,3 + srwi 7,10,16 + addic. 7,7,0 + + + + slwi 7,10,16 + or 7,7,11 + .long 0x7c863840 + bne .Lppcasm_divinnerexit + ble 1,.Lppcasm_divinnerexit + addi 8,8,-1 + subf 12,9,12 + clrlwi 10,5,16 + subf 6,10,6 + b .Lppcasm_divinnerloop +.Lppcasm_divinnerexit: + srwi 10,6,16 + slwi 11,6,16 + .long 0x7c845840 + add 12,12,10 + bge 1,.Lppcasm_div7 + addi 12,12,1 +.Lppcasm_div7: + subf 11,11,4 + .long 0x7c836040 + bge 1,.Lppcasm_div8 + addi 8,8,-1 + add 3,5,3 +.Lppcasm_div8: + subf 12,12,3 + slwi 4,11,16 + + + + insrwi 11,12,16,16 + rotlwi 3,11,16 + bdz .Lppcasm_div9 + slwi 0,8,16 + b .Lppcasm_divouterloop +.Lppcasm_div9: + or 3,8,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_div_words,.-bn_div_words + + + + + + + +.align 4 +bn_sqr_words: + + + + + + + + + + + + + + + addic. 
5,5,0 + beq .Lppcasm_sqr_adios + addi 4,4,-4 + addi 3,3,-4 + mtctr 5 +.Lppcasm_sqr_mainloop: + + lwzu 6,4(4) + mullw 7,6,6 + mulhwu 8,6,6 + stwu 7,4(3) + stwu 8,4(3) + bdnz .Lppcasm_sqr_mainloop +.Lppcasm_sqr_adios: + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_sqr_words,.-bn_sqr_words + + + + + + + + +.align 4 +bn_mul_words: + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 7,5,30,2,31 + beq .Lppcasm_mw_REM + mtctr 7 +.Lppcasm_mw_LOOP: + + lwz 8,0(4) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + + + + + stw 9,0(3) + + lwz 8,4(4) + mullw 11,6,8 + mulhwu 12,6,8 + adde 11,11,10 + + stw 11,4(3) + + lwz 8,8(4) + mullw 9,6,8 + mulhwu 10,6,8 + adde 9,9,12 + + stw 9,8(3) + + lwz 8,12(4) + mullw 11,6,8 + mulhwu 12,6,8 + adde 11,11,10 + addze 12,12 + + stw 11,12(3) + + addi 3,3,16 + addi 4,4,16 + bdnz .Lppcasm_mw_LOOP + +.Lppcasm_mw_REM: + andi. 5,5,0x3 + beq .Lppcasm_mw_OVER + + lwz 8,0(4) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + addze 10,10 + stw 9,0(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + + lwz 8,4(4) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + addze 10,10 + stw 9,4(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + lwz 8,8(4) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + addze 10,10 + stw 9,8(3) + addi 12,10,0 + +.Lppcasm_mw_OVER: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_words,.-bn_mul_words + + + + + + + + +.align 4 +bn_mul_add_words: + + + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 7,5,30,2,31 + beq .Lppcasm_maw_leftover + mtctr 7 +.Lppcasm_maw_mainloop: + + lwz 8,0(4) + lwz 11,0(3) + mullw 9,6,8 + mulhwu 10,6,8 + addc 9,9,12 + addze 10,10 + addc 9,9,11 + + + + + + + stw 9,0(3) + + + lwz 8,4(4) + lwz 9,4(3) + mullw 11,6,8 + mulhwu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + + stw 11,4(3) + + + lwz 8,8(4) + mullw 9,6,8 + lwz 11,8(3) + mulhwu 10,6,8 + adde 9,9,12 + addze 10,10 + addc 9,9,11 + + stw 9,8(3) + + + lwz 8,12(4) + mullw 11,6,8 + lwz 9,12(3) + mulhwu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + addze 12,12 + stw 11,12(3) + addi 3,3,16 + addi 4,4,16 + bdnz .Lppcasm_maw_mainloop + +.Lppcasm_maw_leftover: + andi. 5,5,0x3 + beq .Lppcasm_maw_adios + addi 3,3,-4 + addi 4,4,-4 + + mtctr 5 + lwzu 8,4(4) + mullw 9,6,8 + mulhwu 10,6,8 + lwzu 11,4(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + stw 9,0(3) + + bdz .Lppcasm_maw_adios + + lwzu 8,4(4) + mullw 9,6,8 + mulhwu 10,6,8 + lwzu 11,4(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + stw 9,0(3) + + bdz .Lppcasm_maw_adios + + lwzu 8,4(4) + mullw 9,6,8 + mulhwu 10,6,8 + lwzu 11,4(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + stw 9,0(3) + +.Lppcasm_maw_adios: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_add_words,.-bn_mul_add_words +.align 4 diff --git a/sys/crypto/openssl/powerpc/poly1305-ppc.S b/sys/crypto/openssl/powerpc/poly1305-ppc.S index 5b458a6ec4e52..d6fe346937246 100644 --- a/sys/crypto/openssl/powerpc/poly1305-ppc.S +++ b/sys/crypto/openssl/powerpc/poly1305-ppc.S @@ -11,6 +11,7 @@ poly1305_init_int: stw 0,8(3) stw 0,12(3) stw 0,16(3) + stw 0,24(3) .long 0x7c040040 beq- .Lno_key @@ -46,6 +47,7 @@ poly1305_init_int: .type poly1305_blocks,@function .align 4 poly1305_blocks: +.Lpoly1305_blocks: srwi. 
5,5,4 beq- .Labort @@ -243,18 +245,110 @@ poly1305_blocks: .long 0 .byte 0,12,4,1,0x80,18,4,0 .size poly1305_blocks,.-poly1305_blocks - .globl poly1305_emit .type poly1305_emit,@function -.align 4 +.align 5 poly1305_emit: - stwu 1,-96(1) - mflr 0 - stw 28,80(1) - stw 29,84(1) - stw 30,88(1) - stw 31,92(1) - stw 0,100(1) + lwz 0,24(3) + lwz 6,0(3) + lwz 7,4(3) + lwz 8,8(3) + lwz 9,12(3) + lwz 10,16(3) + cmplwi 0,0 + beq .Lemit_base2_32 + + slwi 11,7,26 + srwi 7,7,6 + slwi 12,8,20 + srwi 8,8,12 + addc 6,6,11 + slwi 11,9,14 + srwi 9,9,18 + adde 7,7,12 + slwi 12,10,8 + srwi 10,10,24 + adde 8,8,11 + adde 9,9,12 + addze 10,10 + +.Lemit_base2_32: + addic 0,6,5 + addze 0,7 + addze 0,8 + addze 0,9 + addze 0,10 + + srwi 0,0,2 + neg 0,0 + andi. 0,0,5 + + addc 6,6,0 + lwz 0,0(5) + addze 7,7 + lwz 11,4(5) + addze 8,8 + lwz 12,8(5) + addze 9,9 + lwz 10,12(5) + + addc 6,6,0 + adde 7,7,11 + adde 8,8,12 + adde 9,9,10 + + addi 3,4,-1 + addi 4,4,7 + + stbu 6,1(3) + srwi 6,6,8 + stbu 8,1(4) + srwi 8,8,8 + + stbu 6,1(3) + srwi 6,6,8 + stbu 8,1(4) + srwi 8,8,8 + + stbu 6,1(3) + srwi 6,6,8 + stbu 8,1(4) + srwi 8,8,8 + + stbu 6,1(3) + stbu 8,1(4) + + stbu 7,1(3) + srwi 7,7,8 + stbu 9,1(4) + srwi 9,9,8 + + stbu 7,1(3) + srwi 7,7,8 + stbu 9,1(4) + srwi 9,9,8 + + stbu 7,1(3) + srwi 7,7,8 + stbu 9,1(4) + srwi 9,9,8 + + stbu 7,1(3) + stbu 9,1(4) + + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.size poly1305_emit,.-poly1305_emit +.globl poly1305_blocks_vsx +.type poly1305_blocks_vsx,@function +.align 5 +poly1305_blocks_vsx: + lwz 7,24(3) + cmplwi 5,128 + bge __poly1305_blocks_vsx + cmplwi 7,0 + beq .Lpoly1305_blocks lwz 7,0(3) lwz 8,4(3) @@ -262,51 +356,946 @@ poly1305_emit: lwz 10,12(3) lwz 11,16(3) - addic 28,7,5 - addze 29,8 - addze 30,9 - addze 31,10 - addze 0,11 + slwi 0,8,26 + srwi 8,8,6 + slwi 12,9,20 + srwi 9,9,12 + addc 7,7,0 + slwi 0,10,14 + srwi 10,10,18 + adde 8,8,12 + slwi 12,11,8 + srwi 11,11,24 + adde 9,9,0 + li 0,0 + adde 10,10,12 + addze 11,11 - srwi 0,0,2 - neg 0,0 + stw 7,0(3) + stw 8,4(3) + stw 9,8(3) + stw 10,12(3) + stw 11,16(3) + stw 0,24(3) - andc 7,7,0 - and 28,28,0 - andc 8,8,0 - and 29,29,0 - or 7,7,28 - lwz 28,0(5) - andc 9,9,0 - and 30,30,0 - or 8,8,29 - lwz 29,4(5) - andc 10,10,0 - and 31,31,0 - or 9,9,30 - lwz 30,8(5) - or 10,10,31 - lwz 31,12(5) + b .Lpoly1305_blocks +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.size poly1305_blocks_vsx,.-poly1305_blocks_vsx + +.align 5 +__poly1305_mul: + .long 0x11E05088 + .long 0x12015088 + .long 0x12225088 + .long 0x12435088 + .long 0x12645088 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + .long 0x12823888 + .long 0x11EFA0C0 + .long 0x12833888 + .long 0x1210A0C0 + .long 0x12843888 + .long 0x1231A0C0 + .long 0x12803088 + .long 0x1252A0C0 + .long 0x12813088 + .long 0x1273A0C0 + + .long 0x12814888 + .long 0x11EFA0C0 + .long 0x12824888 + .long 0x1210A0C0 + .long 0x12834888 + .long 0x1231A0C0 + .long 0x12844888 + .long 0x1252A0C0 + .long 0x12804088 + .long 0x1273A0C0 + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 
0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 - addc 7,7,28 - adde 8,8,29 - adde 9,9,30 - adde 10,10,31 - li 29,4 - stwbrx 7,0,4 - li 30,8 - stwbrx 8,29,4 - li 31,12 - stwbrx 9,30,4 - stwbrx 10,31,4 - lwz 28,80(1) - lwz 29,84(1) - lwz 30,88(1) - lwz 31,92(1) - addi 1,1,96 blr .long 0 -.byte 0,12,4,1,0x80,4,3,0 -.size poly1305_emit,.-poly1305_emit -.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_mul,.-__poly1305_mul + +.align 5 +__poly1305_blocks_vsx: + stwu 1,-384(1) + mflr 0 + li 10,167 + li 11,183 + mfspr 12,256 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,10,1 + addi 10,10,32 + stvx 24,11,1 + addi 11,11,32 + stvx 25,10,1 + addi 10,10,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + stw 12,360(1) + li 12,-1 + mtspr 256,12 + stw 27,364(1) + stw 28,368(1) + stw 29,372(1) + stw 30,376(1) + stw 31,380(1) + stw 0,388(1) + + bl .LPICmeup + + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + .long 0x7FA06699 + .long 0x7F3B6699 + .long 0x7F7C6699 + .long 0x7FFD6699 + .long 0x7FDE6699 + + cmplwi 7,0 + bne .Lskip_init_vsx + + lwz 8,32(3) + lwz 9,36(3) + lwz 10,40(3) + lwz 11,44(3) + + extrwi 7,8,26,6 + extrwi 8,8,6,0 + insrwi 8,9,20,6 + extrwi 9,9,12,0 + insrwi 9,10,14,6 + extrwi 10,10,18,0 + insrwi 10,11,8,6 + extrwi 11,11,24,0 + + .long 0x7D4701E7 + slwi 7,8,2 + .long 0x7D6801E7 + add 8,8,7 + .long 0x7D8801E7 + slwi 8,9,2 + .long 0x7DA901E7 + add 9,9,8 + .long 0x7DC901E7 + slwi 9,10,2 + .long 0x7CCA01E7 + add 10,10,9 + .long 0x7CEA01E7 + slwi 10,11,2 + .long 0x7D0B01E7 + add 11,11,10 + .long 0x7D2B01E7 + + vor 0,10,10 + vor 1,11,11 + vor 2,13,13 + vor 3,6,6 + vor 4,8,8 + + bl __poly1305_mul + + .long 0xF1405057 + .long 0xF1615857 + .long 0xF1A26857 + .long 0xF0C33057 + .long 0xF1044057 + .long 0xF0000057 + .long 0xF0210857 + .long 0xF0421057 + .long 0xF0631857 + .long 0xF0842057 + .long 0x118BA5C4 + .long 0x11CDA5C4 + .long 0x10E6A5C4 + .long 0x1128A5C4 + .long 0x118C58C0 + .long 0x11CE68C0 + .long 0x10E730C0 + .long 0x112940C0 + + bl __poly1305_mul + + addi 7,3,0x60 + lwz 8,0(3) + lwz 9,4(3) + lwz 10,8(3) + lwz 11,12(3) + lwz 0,16(3) + + .long 0x114A068C + .long 0x116B0E8C + .long 0x11AD168C + .long 0x10C61E8C + .long 0x1108268C + vslw 12,11,20 + vslw 14,13,20 + vslw 7,6,20 + vslw 9,8,20 + vadduwm 12,12,11 + vadduwm 14,14,13 + vadduwm 7,7,6 + vadduwm 9,9,8 + + .long 0x7D5D1F99 + .long 0x7D7E1F99 + .long 0x7D9F1F99 + .long 0x7DA03F99 + .long 0x7DDB3F99 + .long 0x7CDC3F99 + .long 0x7CFD3F99 + .long 0x7D1E3F99 + .long 0x7D3F3F99 + + extrwi 7,8,26,6 + extrwi 8,8,6,0 + .long 0x7C0701E7 + insrwi 8,9,20,6 + extrwi 9,9,12,0 + .long 0x7C2801E7 + insrwi 9,10,14,6 + extrwi 10,10,18,0 + .long 0x7C4901E7 + insrwi 10,11,8,6 + extrwi 11,11,24,0 + .long 0x7C6A01E7 + insrwi 11,0,3,5 + .long 0x7C8B01E7 + li 0,1 + stw 0,24(3) + b .Loaded_vsx + +.align 4 +.Lskip_init_vsx: + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001819 + .long 0x7C3B1819 + .long 0x7C5C1819 + .long 0x7C7D1819 + .long 0x7C9E1819 + +.Loaded_vsx: + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + li 7,0x60 + li 8,0x70 + addi 10,3,64 + addi 11,1,39 + 
+ vxor 20,20,20 + .long 0xF000A057 + .long 0xF021A057 + .long 0xF042A057 + .long 0xF063A057 + .long 0xF084A057 + + .long 0x7F5F6699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + vspltisb 28,14 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x1295CEC4 + .long 0x12D6D6C4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + vand 21,21,29 + vand 20,20,29 + vand 22,22,29 + vand 23,23,29 + + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10D4368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x7D5D1A99 + .long 0x7D605299 + .long 0x7D9B5299 + .long 0x7DBC5299 + .long 0x7DDD5299 + .long 0x7EBE5299 + .long 0x7EDF5299 + .long 0x7EE75299 + .long 0x7F085299 + stvx 11,0,11 + stvx 12,27,11 + stvx 13,28,11 + stvx 14,29,11 + stvx 21,30,11 + stvx 22,31,11 + stvx 23,7,11 + stvx 24,8,11 + + addi 4,4,0x40 + addi 12,12,0x50 + addi 0,5,-64 + srdi 0,0,6 + mtctr 0 + b .Loop_vsx + +.align 4 +.Loop_vsx: + + + + + + + + + + + + + + + .long 0x11E55288 + .long 0x12055A88 + .long 0x12256A88 + .long 0x12466A88 + + .long 0x12865288 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12676A88 + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12875A88 + .long 0x1252A0C0 + lvx 12,31,11 + .long 0x12885A88 + .long 0x1273A0C0 + lvx 11,30,11 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12875288 + .long 0x1231A0C0 + .long 0x12885288 + .long 0x1252A0C0 + lvx 14,8,11 + .long 0x12895288 + .long 0x1273A0C0 + lvx 13,7,11 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x7F406699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + lvx 12,27,11 + .long 0x12815888 + .long 0x1273A0C0 + lvx 11,0,11 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + lvx 14,29,11 + .long 0x12806888 + .long 0x1273A0C0 + lvx 13,28,11 + + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 
0x1273A0C0 + + .long 0x12D6D6C4 + .long 0x1355CEC4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + vand 21,21,29 + vand 26,26,29 + vand 22,22,29 + vand 23,23,29 + + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10DA368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + addi 4,4,0x40 + bdnz .Loop_vsx + + neg 5,5 + andi. 5,5,0x30 + sub 4,4,5 + + .long 0x7D5D1E99 + .long 0x7D605699 + .long 0x7D9B5699 + .long 0x7DBC5699 + .long 0x7DDD5699 + +.Last_vsx: + .long 0x11E55288 + .long 0x12065288 + .long 0x12275288 + .long 0x12485288 + .long 0x12695288 + + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12855A88 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12875A88 + .long 0x1252A0C0 + .long 0x7D9F5699 + .long 0x12885A88 + .long 0x1273A0C0 + .long 0x7D7E5699 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12856A88 + .long 0x1231A0C0 + .long 0x12866A88 + .long 0x1252A0C0 + .long 0x7DC85699 + .long 0x12876A88 + .long 0x1273A0C0 + .long 0x7DA75699 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + .long 0x7D9B5699 + .long 0x12815888 + .long 0x1273A0C0 + .long 0x7D605699 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + .long 0x7DDD5699 + .long 0x12806888 + .long 0x1273A0C0 + .long 0x7DBC5699 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + + + + .long 0xF00F7A57 + .long 0xF0308257 + .long 0xF0518A57 + .long 0xF0729257 + .long 0xF0939A57 + .long 0x11EF00C0 + .long 0x121008C0 + .long 0x123110C0 + .long 0x125218C0 + .long 0x127320C0 + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + 
.long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + beq .Ldone_vsx + + add 6,12,5 + + .long 0x7F406699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF297C057 + vperm 21,23,24,31 + .long 0xF2D7C357 + + .long 0x7DE03699 + .long 0x7E1D3699 + + .long 0x12F4CEC4 + .long 0x12B5D6C4 + .long 0x1316DEC4 + .long 0x12D6E6C4 + vand 20,20,29 + vand 23,23,29 + vand 21,21,29 + vand 22,22,29 + + + .long 0x11384E8C + .long 0x10B42E8C + .long 0x10D7368C + .long 0x10F53E8C + .long 0x1116468C + vor 9,9,30 + + vperm 0,0,0,15 + vand 5,5, 16 + vperm 1,1,1,15 + vand 6,6, 16 + vperm 2,2,2,15 + vand 7,7, 16 + vperm 3,3,3,15 + vand 8,8, 16 + vperm 4,4,4,15 + vand 9,9, 16 + + .long 0x10A500C0 + vxor 0,0,0 + .long 0x10C608C0 + vxor 1,1,1 + .long 0x10E710C0 + vxor 2,2,2 + .long 0x110818C0 + vxor 3,3,3 + .long 0x112920C0 + vxor 4,4,4 + + xor. 5,5,5 + b .Last_vsx + +.align 4 +.Ldone_vsx: + lwz 0,388(1) + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001919 + .long 0x7C3B1919 + .long 0x7C5C1919 + .long 0x7C7D1919 + .long 0x7C9E1919 + + lwz 12,360(1) + mtlr 0 + li 10,167 + li 11,183 + mtspr 256,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,10,1 + addi 10,10,32 + lvx 22,11,1 + addi 11,11,32 + lvx 23,10,1 + addi 10,10,32 + lvx 24,11,1 + addi 11,11,32 + lvx 25,10,1 + addi 10,10,32 + lvx 26,11,1 + addi 11,11,32 + lvx 27,10,1 + addi 10,10,32 + lvx 28,11,1 + addi 11,11,32 + lvx 29,10,1 + addi 10,10,32 + lvx 30,11,1 + lvx 31,10,1 + lwz 27,364(1) + lwz 28,368(1) + lwz 29,372(1) + lwz 30,376(1) + lwz 31,380(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,0x04,1,0x80,5,4,0 +.long 0 +.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx + +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 12 + addi 12,12,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 + +.long 0x00000000,0x03ffffff +.long 0x00000000,0x03ffffff +.long 0x00000000,0x0000001a +.long 0x00000000,0x0000001a +.long 0x00000000,0x00000028 +.long 0x00000000,0x00000028 +.long 0x00000000,0x0e0f0001 +.long 0x00000000,0x1e1f1011 +.long 0x01000000,0x01000000 +.long 0x01000000,0x01000000 +.long 0x07060504,0x03020100 +.long 0x0f0e0d0c,0x0b0a0908 + +.long 0x00000000,0x00000000 +.long 0x00000000,0x04050607 +.long 0x04050607,0x00000000 +.long 0x00000000,0x00000000 +.long 0x00000000,0x00000000 +.long 0x04050607,0x00000000 + +.long 0xffffffff,0x00000000 +.long 0xffffffff,0xffffffff +.long 0xffffffff,0x00000000 +.long 0xffffffff,0x00000000 +.long 0x00000000,0x00000000 +.long 0xffffffff,0x00000000 +.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 diff --git a/sys/crypto/openssl/powerpc/vpaes-ppc.S b/sys/crypto/openssl/powerpc/vpaes-ppc.S index fd11edc79389d..9a21c796aa814 100644 --- a/sys/crypto/openssl/powerpc/vpaes-ppc.S +++ b/sys/crypto/openssl/powerpc/vpaes-ppc.S @@ -667,7 +667,7 @@ vpaes_cbc_encrypt: vor 24,0,0 sub. 
30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -719,7 +719,7 @@ vpaes_cbc_encrypt: vor 24,25,25 sub. 30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -1037,7 +1037,7 @@ _vpaes_schedule_core: vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1059,7 +1059,7 @@ _vpaes_schedule_core: addi 9, 5, -15 vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1150,7 +1150,7 @@ _vpaes_schedule_low_round: vsldoi 1, 9, 7, 12 vxor 7, 7, 1 - vspltisb 1, 0x0f + vspltisb 1,0x0f vsldoi 4, 9, 7, 8 @@ -1246,7 +1246,7 @@ _vpaes_schedule_mangle: vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr @@ -1297,7 +1297,7 @@ _vpaes_schedule_mangle: vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr diff --git a/sys/crypto/openssl/powerpc64/bn-ppc.S b/sys/crypto/openssl/powerpc64/bn-ppc.S new file mode 100644 index 0000000000000..50805d3d52bc7 --- /dev/null +++ b/sys/crypto/openssl/powerpc64/bn-ppc.S @@ -0,0 +1,1876 @@ +/* Do not modify. This file is auto-generated from ppc.pl. */ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.globl bn_sqr_comba4 +.type bn_sqr_comba4,@function +.globl bn_sqr_comba8 +.type bn_sqr_comba8,@function +.globl bn_mul_comba4 +.type bn_mul_comba4,@function +.globl bn_mul_comba8 +.type bn_mul_comba8,@function +.globl bn_sub_words +.type bn_sub_words,@function +.globl bn_add_words +.type bn_add_words,@function +.globl bn_div_words +.type bn_div_words,@function +.globl bn_sqr_words +.type bn_sqr_words,@function +.globl bn_mul_words +.type bn_mul_words,@function +.globl bn_mul_add_words +.type bn_mul_add_words,@function + + + +.machine "any" +.abiversion 2 +.text + + + + + + + + +.align 4 +bn_sqr_comba4: +.localentry bn_sqr_comba4,0 + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + ld 5,0(4) + mulld 9,5,5 + mulhdu 10,5,5 + + + + + std 9,0(3) + + ld 6,8(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 9,0 + + + addc 10,7,10 + addze 11,8 + addze 9,9 + + std 10,8(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,16(3) + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,24(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,32(3) + + ld 5,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,40(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + + std 9,48(3) + std 10,56(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba4,.-bn_sqr_comba4 + + + + + + + + +.align 4 +bn_sqr_comba8: +.localentry bn_sqr_comba8,0 + + + + + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + ld 5,0(4) + mulld 9,5,5 + mulhdu 10,5,5 + std 9,0(3) + + ld 6,8(4) + mulld 7,5,6 + mulhdu 
8,5,6 + + addc 10,7,10 + adde 11,8,0 + addze 9,0 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + std 10,8(3) + + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + std 11,16(3) + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + std 9,24(3) + + mulld 7,6,6 + mulhdu 8,6,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,0(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,32(3) + + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,8(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,16(4) + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,40(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,0(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,48(3) + + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,8(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,16(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,24(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,56(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,16(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,8(4) + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,64(3) + + ld 5,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,24(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,32(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,72(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + 
adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,24(4) + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,80(3) + + ld 5,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,40(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,88(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,96(3) + + + ld 5,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,104(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + std 11,112(3) + std 9, 120(3) + + + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba8,.-bn_sqr_comba8 + + + + + + + + +.align 4 +bn_mul_comba4: +.localentry bn_mul_comba4,0 + + + + + + + + + + + + + xor 0,0,0 + + ld 6,0(4) + ld 7,0(5) + mulld 10,6,7 + mulhdu 11,6,7 + std 10,0(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,0 + addze 10,0 + + ld 6, 8(4) + ld 7, 0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + std 11,8(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + ld 6,8(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + + ld 6,0(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + std 12,16(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,0 + + ld 6,8(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + ld 6,16(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + ld 6,24(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + std 10,24(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,0 + + ld 6,16(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + + ld 6,8(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + std 11,32(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + ld 6,24(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + std 12,40(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + + std 10,48(3) + std 11,56(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba4,.-bn_mul_comba4 + + + + + + + + +.align 4 +bn_mul_comba8: +.localentry bn_mul_comba8,0 + + + + + + + + + + + + + xor 0,0,0 + + + ld 6,0(4) + ld 7,0(5) + mulld 10,6,7 + mulhdu 11,6,7 + std 10,0(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + addze 12,9 + addze 10,0 + + ld 6,8(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,8(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,8(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,0(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,16(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + 
addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,8(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + + ld 6,16(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,24(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,24(3) + + ld 6,32(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,24(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,16(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,8(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,0(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,32(3) + + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,8(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,16(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,24(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,32(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,40(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,40(3) + + ld 6,48(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,40(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,32(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,24(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,16(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,8(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,0(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,48(3) + + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,8(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,16(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,24(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,32(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,40(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,48(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,56(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,56(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,48(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,40(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,32(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,24(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,16(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + 
addze 11,11 + + ld 6,8(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,64(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,24(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,32(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,40(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,48(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,56(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,72(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,48(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,40(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,32(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,24(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,80(3) + + ld 6,32(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,40(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,48(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,56(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,88(3) + + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,48(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,40(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,96(3) + + ld 6,48(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,56(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,104(3) + + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + std 12,112(3) + std 10,120(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba8,.-bn_mul_comba8 + + + + + + + + +.align 4 +bn_sub_words: +.localentry bn_sub_words,0 + + + + + + + + + + + + + + + xor 0,0,0 + + + + subfc. 7,0,6 + + + beq .Lppcasm_sub_adios + addi 4,4,-8 + addi 3,3,-8 + addi 5,5,-8 + mtctr 6 +.Lppcasm_sub_mainloop: + ldu 7,8(4) + ldu 8,8(5) + subfe 6,8,7 + + + stdu 6,8(3) + bdnz .Lppcasm_sub_mainloop +.Lppcasm_sub_adios: + subfze 3,0 + andi. 3,3,1 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_sub_words,.-bn_sub_words + + + + + + + + +.align 4 +bn_add_words: +.localentry bn_add_words,0 + + + + + + + + + + + + + + + xor 0,0,0 + + + + addic. 6,6,0 + beq .Lppcasm_add_adios + addi 4,4,-8 + addi 3,3,-8 + addi 5,5,-8 + mtctr 6 +.Lppcasm_add_mainloop: + ldu 7,8(4) + ldu 8,8(5) + adde 8,7,8 + stdu 8,8(3) + bdnz .Lppcasm_add_mainloop +.Lppcasm_add_adios: + addze 3,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_add_words,.-bn_add_words + + + + + + + + +.align 4 +bn_div_words: +.localentry bn_div_words,0 + + + + + + + + + + + + + cmpldi 0,5,0 + bne .Lppcasm_div1 + li 3,-1 + blr +.Lppcasm_div1: + xor 0,0,0 + li 8,64 + cntlzd. 7,5 + beq .Lppcasm_div2 + subf 8,7,8 + srd. 
9,3,8 + td 16,9,0 +.Lppcasm_div2: + cmpld 0,3,5 + blt .Lppcasm_div3 + subf 3,5,3 +.Lppcasm_div3: + cmpi 0,0,7,0 + beq .Lppcasm_div4 + sld 3,3,7 + srd 8,4,8 + sld 5,5,7 + or 3,3,8 + sld 4,4,7 +.Lppcasm_div4: + srdi 9,5,32 + + + li 6,2 + mtctr 6 +.Lppcasm_divouterloop: + srdi 8,3,32 + srdi 11,4,32 + + cmpld 0,8,9 + bne .Lppcasm_div5 + + li 8,-1 + clrldi 8,8,32 + b .Lppcasm_div6 +.Lppcasm_div5: + divdu 8,3,9 +.Lppcasm_div6: + mulld 12,9,8 + clrldi 10,5,32 + mulld 6,8,10 + +.Lppcasm_divinnerloop: + subf 10,12,3 + srdi 7,10,32 + addic. 7,7,0 + + + + sldi 7,10,32 + or 7,7,11 + cmpld 1,6,7 + bne .Lppcasm_divinnerexit + ble 1,.Lppcasm_divinnerexit + addi 8,8,-1 + subf 12,9,12 + clrldi 10,5,32 + subf 6,10,6 + b .Lppcasm_divinnerloop +.Lppcasm_divinnerexit: + srdi 10,6,32 + sldi 11,6,32 + cmpld 1,4,11 + add 12,12,10 + bge 1,.Lppcasm_div7 + addi 12,12,1 +.Lppcasm_div7: + subf 11,11,4 + cmpld 1,3,12 + bge 1,.Lppcasm_div8 + addi 8,8,-1 + add 3,5,3 +.Lppcasm_div8: + subf 12,12,3 + sldi 4,11,32 + + + + insrdi 11,12,32,32 + rotldi 3,11,32 + bdz .Lppcasm_div9 + sldi 0,8,32 + b .Lppcasm_divouterloop +.Lppcasm_div9: + or 3,8,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_div_words,.-bn_div_words + + + + + + + +.align 4 +bn_sqr_words: +.localentry bn_sqr_words,0 + + + + + + + + + + + + + + + + addic. 5,5,0 + beq .Lppcasm_sqr_adios + addi 4,4,-8 + addi 3,3,-8 + mtctr 5 +.Lppcasm_sqr_mainloop: + + ldu 6,8(4) + mulld 7,6,6 + mulhdu 8,6,6 + stdu 7,8(3) + stdu 8,8(3) + bdnz .Lppcasm_sqr_mainloop +.Lppcasm_sqr_adios: + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_sqr_words,.-bn_sqr_words + + + + + + + + +.align 4 +bn_mul_words: +.localentry bn_mul_words,0 + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 7,5,30,2,31 + beq .Lppcasm_mw_REM + mtctr 7 +.Lppcasm_mw_LOOP: + + ld 8,0(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + + + + + std 9,0(3) + + ld 8,8(4) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + + std 11,8(3) + + ld 8,16(4) + mulld 9,6,8 + mulhdu 10,6,8 + adde 9,9,12 + + std 9,16(3) + + ld 8,24(4) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + + std 11,24(3) + + addi 3,3,32 + addi 4,4,32 + bdnz .Lppcasm_mw_LOOP + +.Lppcasm_mw_REM: + andi. 5,5,0x3 + beq .Lppcasm_mw_OVER + + ld 8,0(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,0(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + + ld 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,8(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + ld 8,16(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,16(3) + addi 12,10,0 + +.Lppcasm_mw_OVER: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_words,.-bn_mul_words + + + + + + + + +.align 4 +bn_mul_add_words: +.localentry bn_mul_add_words,0 + + + + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 
7,5,30,2,31 + beq .Lppcasm_maw_leftover + mtctr 7 +.Lppcasm_maw_mainloop: + + ld 8,0(4) + ld 11,0(3) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + addc 9,9,11 + + + + + + + std 9,0(3) + + + ld 8,8(4) + ld 9,8(3) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + + std 11,8(3) + + + ld 8,16(4) + mulld 9,6,8 + ld 11,16(3) + mulhdu 10,6,8 + adde 9,9,12 + addze 10,10 + addc 9,9,11 + + std 9,16(3) + + + ld 8,24(4) + mulld 11,6,8 + ld 9,24(3) + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + addze 12,12 + std 11,24(3) + addi 3,3,32 + addi 4,4,32 + bdnz .Lppcasm_maw_mainloop + +.Lppcasm_maw_leftover: + andi. 5,5,0x3 + beq .Lppcasm_maw_adios + addi 3,3,-8 + addi 4,4,-8 + + mtctr 5 + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + + bdz .Lppcasm_maw_adios + + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + + bdz .Lppcasm_maw_adios + + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + +.Lppcasm_maw_adios: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_add_words,.-bn_mul_add_words +.align 4 diff --git a/sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S b/sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S new file mode 100644 index 0000000000000..5905180d168a6 --- /dev/null +++ b/sys/crypto/openssl/powerpc64/ecp_nistp521-ppc64.S @@ -0,0 +1,354 @@ +/* Do not modify. This file is auto-generated from ecp_nistp521-ppc64.pl. */ +.machine "any" +.abiversion 2 +.text + +.globl p521_felem_mul +.type p521_felem_mul,@function +.align 5 +p521_felem_mul: +.localentry p521_felem_mul,0 + + + mr 12,1 + stdu 1,-16*13(1) + + stxv 52,-16*12(12) + stxv 53,-16*11(12) + stxv 54,-16*10(12) + stxv 55,-16*9(12) + stxv 56,-16*8(12) + stxv 57,-16*7(12) + stxv 58,-16*6(12) + stxv 59,-16*5(12) + stxv 60,-16*4(12) + stxv 61,-16*3(12) + stxv 62,-16*2(12) + stxv 63,-16*1(12) + + vspltisw 0,0 + + lxsd 13,0(4) + lxsd 14,8(4) + lxsd 15,16(4) + lxsd 16,24(4) + lxsd 17,32(4) + lxsd 18,40(4) + lxsd 19,48(4) + lxsd 20,56(4) + lxsd 21,64(4) + + lxsd 3,0(5) + lxsd 4,8(5) + lxsd 5,16(5) + lxsd 6,24(5) + lxsd 7,32(5) + lxsd 8,40(5) + lxsd 9,48(5) + lxsd 10,56(5) + lxsd 11,64(5) + + .long 0x12ED1823 + + xxpermdi 33,45,46,0b00 + xxpermdi 34,36,35,0b00 + .long 0x13011023 + + xxpermdi 34,37,36,0b00 + .long 0x13211023 + .long 0x132F1E63 + + xxpermdi 34,38,37,0b00 + .long 0x13411023 + xxpermdi 44,47,48,0b00 + xxpermdi 54,36,35,0b00 + .long 0x134CB6A3 + + xxpermdi 34,39,38,0b00 + .long 0x13611023 + xxpermdi 54,37,36,0b00 + .long 0x136CB6E3 + .long 0x13711EE3 + + xxpermdi 34,40,39,0b00 + .long 0x13811023 + xxpermdi 54,38,37,0b00 + .long 0x138CB723 + + xxpermdi 34,41,40,0b00 + .long 0x13A11023 + xxpermdi 54,39,38,0b00 + .long 0x13ACB763 + + xxpermdi 34,42,41,0b00 + .long 0x13C11023 + xxpermdi 54,40,39,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,43,42,0b00 + .long 0x13E11023 + xxpermdi 54,41,40,0b00 + .long 0x13ECB7E3 + + xxpermdi 33,49,50,0b00 + xxpermdi 34,36,35,0b00 + .long 0x13811723 + + xxpermdi 34,37,36,0b00 + .long 0x13A11763 + .long 0x13B31F63 + + xxpermdi 34,38,37,0b00 + .long 0x13C117A3 + xxpermdi 44,51,52,0b00 + xxpermdi 54,36,35,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,39,38,0b00 + .long 0x13E117E3 + xxpermdi 54,37,36,0b00 + .long 0x13ECB7E3 + .long 0x13F51FE3 + + li 8,0 + li 9,1 + mtvsrdd 33,9,8 + .long 0x10630DC4 + .long 0x10840DC4 
+ .long 0x10A50DC4 + .long 0x10C60DC4 + .long 0x10E70DC4 + .long 0x11080DC4 + .long 0x11290DC4 + .long 0x114A0DC4 + .long 0x116B0DC4 + + .long 0x13D55FA3 + + xxpermdi 34,43,42,0b00 + xxpermdi 33,52,53,0b00 + .long 0x13A11763 + + xxpermdi 33,51,52,0b00 + .long 0x13811723 + .long 0x13954F23 + + xxpermdi 33,50,51,0b00 + .long 0x136116E3 + xxpermdi 54,41,40,0b00 + xxpermdi 44,52,53,0b00 + .long 0x136CB6E3 + + xxpermdi 33,49,50,0b00 + .long 0x134116A3 + xxpermdi 44,51,52,0b00 + .long 0x134CB6A3 + .long 0x13553EA3 + + xxpermdi 33,48,49,0b00 + .long 0x13211663 + xxpermdi 44,50,51,0b00 + .long 0x132CB663 + + xxpermdi 33,47,48,0b00 + .long 0x13011623 + xxpermdi 44,49,50,0b00 + .long 0x130CB623 + + xxpermdi 33,46,47,0b00 + .long 0x12E115E3 + xxpermdi 44,48,49,0b00 + .long 0x12ECB5E3 + + xxpermdi 34,39,38,0b00 + xxpermdi 33,52,53,0b00 + .long 0x13211663 + + xxpermdi 33,51,52,0b00 + .long 0x13011623 + .long 0x13152E23 + + xxpermdi 33,50,51,0b00 + .long 0x12E115E3 + xxpermdi 54,37,36,0b00 + xxpermdi 44,52,53,0b00 + .long 0x12ECB5E3 + + stxv 55,0(3) + stxv 56,16(3) + stxv 57,32(3) + stxv 58,48(3) + stxv 59,64(3) + stxv 60,80(3) + stxv 61,96(3) + stxv 62,112(3) + stxv 63,128(3) + + ld 12,0(1) + lxv 52,-16*12(12) + lxv 53,-16*11(12) + lxv 54,-16*10(12) + lxv 55,-16*9(12) + lxv 56,-16*8(12) + lxv 57,-16*7(12) + lxv 58,-16*6(12) + lxv 59,-16*5(12) + lxv 60,-16*4(12) + lxv 61,-16*3(12) + lxv 62,-16*2(12) + lxv 63,-16*1(12) + mr 1,12 + + blr +.size p521_felem_mul,.-p521_felem_mul + +.globl p521_felem_square +.type p521_felem_square,@function +.align 5 +p521_felem_square: +.localentry p521_felem_square,0 + + + mr 12,1 + stdu 1,-16*13(1) + + stxv 52,-16*12(12) + stxv 53,-16*11(12) + stxv 54,-16*10(12) + stxv 55,-16*9(12) + stxv 56,-16*8(12) + stxv 57,-16*7(12) + stxv 58,-16*6(12) + stxv 59,-16*5(12) + stxv 60,-16*4(12) + stxv 61,-16*3(12) + stxv 62,-16*2(12) + stxv 63,-16*1(12) + + vspltisw 0,0 + + lxsd 13,0(4) + lxsd 14,8(4) + lxsd 15,16(4) + lxsd 16,24(4) + lxsd 17,32(4) + lxsd 18,40(4) + lxsd 19,48(4) + lxsd 20,56(4) + lxsd 21,64(4) + + li 8,0 + li 9,1 + mtvsrdd 33,9,8 + .long 0x106D0DC4 + .long 0x108E0DC4 + .long 0x10AF0DC4 + .long 0x10D00DC4 + .long 0x10F10DC4 + .long 0x11120DC4 + .long 0x11330DC4 + .long 0x11540DC4 + .long 0x11750DC4 + .long 0x12ED6823 + + .long 0x130D2023 + + xxpermdi 33,45,46,0b00 + xxpermdi 34,37,46,0b00 + .long 0x13211023 + + xxpermdi 34,38,37,0b00 + .long 0x13411023 + + xxpermdi 34,39,38,0b00 + .long 0x13611023 + .long 0x136F7EE3 + + xxpermdi 34,40,39,0b00 + .long 0x13811023 + .long 0x138F3723 + + xxpermdi 34,41,40,0b00 + .long 0x13A11023 + xxpermdi 44,47,48,0b00 + xxpermdi 54,39,48,0b00 + .long 0x13ACB763 + + xxpermdi 34,42,41,0b00 + .long 0x13C11023 + xxpermdi 54,40,39,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,43,42,0b00 + .long 0x13E11023 + xxpermdi 54,41,40,0b00 + .long 0x13ECB7E3 + .long 0x13F18FE3 + + .long 0x13124623 + + .long 0x13534EA3 + + .long 0x13945723 + + .long 0x13D55FA3 + + mtvsrdd 33,9,8 + .long 0x11080DC4 + .long 0x11290DC4 + .long 0x114A0DC4 + .long 0x116B0DC4 + + .long 0x13B45F63 + + .long 0x13935F23 + + xxpermdi 34,43,42,0b00 + xxpermdi 33,50,51,0b00 + .long 0x136116E3 + + xxpermdi 33,49,50,0b00 + .long 0x134116A3 + + xxpermdi 33,48,49,0b00 + .long 0x13211663 + .long 0x13324E63 + + xxpermdi 33,47,48,0b00 + .long 0x13011623 + .long 0x13114E23 + + xxpermdi 33,46,47,0b00 + .long 0x12E115E3 + xxpermdi 34,41,40,0b00 + xxpermdi 33,48,49,0b00 + .long 0x12E115E3 + + stxv 55,0(3) + stxv 56,16(3) + stxv 57,32(3) + stxv 58,48(3) + stxv 59,64(3) + stxv 60,80(3) + stxv 61,96(3) 
+ stxv 62,112(3) + stxv 63,128(3) + + ld 12,0(1) + lxv 52,-16*12(12) + lxv 53,-16*11(12) + lxv 54,-16*10(12) + lxv 55,-16*9(12) + lxv 56,-16*8(12) + lxv 57,-16*7(12) + lxv 58,-16*6(12) + lxv 59,-16*5(12) + lxv 60,-16*4(12) + lxv 61,-16*3(12) + lxv 62,-16*2(12) + lxv 63,-16*1(12) + mr 1,12 + + blr +.size p521_felem_square,.-p521_felem_square + diff --git a/sys/crypto/openssl/powerpc64/keccak1600-ppc64.S b/sys/crypto/openssl/powerpc64/keccak1600-ppc64.S index bac074cdcf6f0..251f59855f5d4 100644 --- a/sys/crypto/openssl/powerpc64/keccak1600-ppc64.S +++ b/sys/crypto/openssl/powerpc64/keccak1600-ppc64.S @@ -304,19 +304,19 @@ KeccakF1600: dword_le_load: .localentry dword_le_load,0 - lbzu 0,1(3) - lbzu 4,1(3) - lbzu 5,1(3) + lbz 0,1(3) + lbz 4,2(3) + lbz 5,3(3) insrdi 0,4,8,48 - lbzu 4,1(3) + lbz 4,4(3) insrdi 0,5,8,40 - lbzu 5,1(3) + lbz 5,5(3) insrdi 0,4,8,32 - lbzu 4,1(3) + lbz 4,6(3) insrdi 0,5,8,24 - lbzu 5,1(3) + lbz 5,7(3) insrdi 0,4,8,16 - lbzu 4,1(3) + lbzu 4,8(3) insrdi 0,5,8,8 insrdi 0,4,8,0 blr @@ -579,21 +579,21 @@ SHA3_squeeze: cmpldi 30,8 blt .Lsqueeze_tail - stbu 0,1(29) + stb 0,1(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,2(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,3(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,4(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,5(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,6(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,7(29) srdi 0,0,8 - stbu 0,1(29) + stbu 0,8(29) subic. 30,30,8 beq .Lsqueeze_done diff --git a/sys/crypto/openssl/powerpc64/poly1305-ppc.S b/sys/crypto/openssl/powerpc64/poly1305-ppc.S index d8b1066022933..523a590809cd6 100644 --- a/sys/crypto/openssl/powerpc64/poly1305-ppc.S +++ b/sys/crypto/openssl/powerpc64/poly1305-ppc.S @@ -12,6 +12,7 @@ poly1305_init_int: std 0,0(3) std 0,8(3) std 0,16(3) + stw 0,24(3) cmpld 4,0 beq- .Lno_key @@ -48,6 +49,7 @@ poly1305_init_int: poly1305_blocks: .localentry poly1305_blocks,0 +.Lpoly1305_blocks: srdi. 
5,5,4 beq- .Labort @@ -138,48 +140,1003 @@ poly1305_blocks: .long 0 .byte 0,12,4,1,0x80,5,4,0 .size poly1305_blocks,.-poly1305_blocks - .globl poly1305_emit .type poly1305_emit,@function -.align 4 +.align 5 poly1305_emit: .localentry poly1305_emit,0 - ld 7,0(3) - ld 8,8(3) - ld 9,16(3) - ld 6,0(5) - ld 5,8(5) + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + lwz 0,24(3) + + sldi 8,8,26 + sldi 12,9,52 + srdi 9,9,12 + sldi 10,10,14 + add 7,7,8 + addc 7,7,12 + sldi 12,11,40 + srdi 11,11,24 + adde 8,9,10 + addc 8,8,12 + addze 9,11 + + ld 10,0(3) + ld 11,8(3) + ld 12,16(3) + + neg 0,0 + xor 7,7,10 + xor 8,8,11 + xor 9,9,12 + and 7,7,0 + and 8,8,0 + and 9,9,0 + xor 7,7,10 + xor 8,8,11 + xor 9,9,12 addic 10,7,5 addze 11,8 addze 12,9 - srdi 0,12,2 - neg 0,0 + srdi 12,12,2 + neg 12,12 - andc 7,7,0 - and 10,10,0 - andc 8,8,0 - and 11,11,0 + andc 7,7,12 + and 10,10,12 + andc 8,8,12 + and 11,11,12 or 7,7,10 or 8,8,11 - rotldi 6,6,32 - rotldi 5,5,32 - addc 7,7,6 - adde 8,8,5 - rldicl 0,7,32,32 - li 10,4 - stwbrx 7,0,4 - rldicl 7,8,32,32 - li 11,8 - stwbrx 0,10,4 - li 12,12 - stwbrx 8,11,4 - stwbrx 7,12,4 + + lwz 12,4(5) + lwz 9,12(5) + lwz 10,0(5) + lwz 11,8(5) + + insrdi 10,12,32,0 + insrdi 11,9,32,0 + + addc 7,7,10 + adde 8,8,11 + + addi 3,4,-1 + addi 4,4,7 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + stbu 8,1(4) + blr .long 0 .byte 0,12,0x14,0,0,0,3,0 .size poly1305_emit,.-poly1305_emit -.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.globl poly1305_blocks_vsx +.type poly1305_blocks_vsx,@function +.align 5 +poly1305_blocks_vsx: +.localentry poly1305_blocks_vsx,0 + + lwz 7,24(3) + cmpldi 5,128 + bge __poly1305_blocks_vsx + + neg 0,7 + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + + sldi 8,8,26 + sldi 12,9,52 + add 7,7,8 + srdi 9,9,12 + sldi 10,10,14 + addc 7,7,12 + sldi 8,11,40 + adde 9,9,10 + srdi 11,11,24 + addc 9,9,8 + addze 11,11 + + ld 8,0(3) + ld 10,8(3) + ld 12,16(3) + + xor 7,7,8 + xor 9,9,10 + xor 11,11,12 + and 7,7,0 + and 9,9,0 + and 11,11,0 + xor 7,7,8 + xor 9,9,10 + xor 11,11,12 + + li 0,0 + std 7,0(3) + std 9,8(3) + std 11,16(3) + stw 0,24(3) + + b .Lpoly1305_blocks +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.size poly1305_blocks_vsx,.-poly1305_blocks_vsx + +.align 5 +__poly1305_mul: + mulld 9,6,27 + mulhdu 10,6,27 + + mulld 30,7,29 + mulhdu 31,7,29 + addc 9,9,30 + adde 10,10,31 + + mulld 30,6,28 + mulhdu 11,6,28 + addc 10,10,30 + addze 11,11 + + mulld 30,7,27 + mulhdu 31,7,27 + addc 10,10,30 + adde 11,11,31 + + mulld 30,8,29 + mulld 31,8,27 + addc 10,10,30 + adde 11,11,31 + + andc 30,11,0 + and 8,11,0 + srdi 31,30,2 + add 30,30,31 + addc 6,9,30 + addze 7,10 + addze 8,8 + + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_mul,.-__poly1305_mul + +.align 5 +__poly1305_splat: + rldicl 9,6,0,38 + rldicl 10,6,38,38 + stw 9,0x00(31) + + rldicl 11,6,12,52 + slwi 9,10,2 + stw 10,0x10(31) + add 9,9,10 + stw 9,0x20(31) + + insrdi 11,7,14,38 + slwi 9,11,2 + stw 11,0x30(31) + add 9,9,11 + stw 9,0x40(31) + + rldicl 10,7,50,38 + rldicl 11,7,24,40 + slwi 9,10,2 + stw 
10,0x50(31) + add 9,9,10 + stw 9,0x60(31) + + insrdi 11,8,3,37 + slwi 9,11,2 + stw 11,0x70(31) + add 9,9,11 + stw 9,0x80(31) + + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_splat,.-__poly1305_splat + +.align 5 +__poly1305_blocks_vsx: + stdu 1,-432(1) + mflr 0 + li 10,191 + li 11,207 + li 12,-1 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,10,1 + addi 10,10,32 + stvx 24,11,1 + addi 11,11,32 + stvx 25,10,1 + addi 10,10,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + stw 12,388(1) + li 12,-1 + or 12,12,12 + std 27,392(1) + std 28,400(1) + std 29,408(1) + std 30,416(1) + std 31,424(1) + std 0,448(1) + + bl .LPICmeup + + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + .long 0x7FA06699 + .long 0x7F3B6699 + .long 0x7F7C6699 + .long 0x7FFD6699 + .long 0x7FDE6699 + + cmplwi 7,0 + bne .Lskip_init_vsx + + ld 27,32(3) + ld 28,40(3) + srdi 29,28,2 + li 0,3 + add 29,29,28 + + mr 6,27 + mr 7,28 + li 8,0 + addi 31,3,56 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,48 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,60 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,52 + bl __poly1305_splat + + ld 6,0(3) + ld 7,8(3) + ld 8,16(3) + + rldicl 9,6,0,38 + rldicl 10,6,38,38 + rldicl 11,6,12,52 + .long 0x7C0901E7 + insrdi 11,7,14,38 + .long 0x7C2A01E7 + rldicl 10,7,50,38 + .long 0x7C4B01E7 + rldicl 11,7,24,40 + .long 0x7C6A01E7 + insrdi 11,8,3,37 + .long 0x7C8B01E7 + li 0,1 + stw 0,24(3) + b .Loaded_vsx + +.align 4 +.Lskip_init_vsx: + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001819 + .long 0x7C3B1819 + .long 0x7C5C1819 + .long 0x7C7D1819 + .long 0x7C9E1819 + +.Loaded_vsx: + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + li 7,0x60 + li 8,0x70 + addi 10,3,64 + addi 11,1,63 + + vxor 20,20,20 + .long 0xF000A057 + .long 0xF021A057 + .long 0xF042A057 + .long 0xF063A057 + .long 0xF084A057 + + .long 0x7F5F6699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + vspltisb 28,14 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x1295CEC4 + .long 0x12D6D6C4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + vand 21,21,29 + vand 20,20,29 + vand 22,22,29 + vand 23,23,29 + + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10D4368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x7D5D1A99 + .long 0x7D605299 + .long 0x7D9B5299 + .long 0x7DBC5299 + .long 0x7DDD5299 + .long 0x7EBE5299 + .long 0x7EDF5299 + .long 0x7EE75299 + .long 0x7F085299 + stvx 11,0,11 + stvx 12,27,11 + stvx 13,28,11 + stvx 14,29,11 + stvx 21,30,11 + stvx 22,31,11 + stvx 23,7,11 + stvx 24,8,11 + + addi 4,4,0x40 + addi 12,12,0x50 + addi 0,5,-64 + srdi 0,0,6 + mtctr 0 + b .Loop_vsx + +.align 4 +.Loop_vsx: + + + + + + + + + + + + + + + .long 0x11E55288 + .long 0x12055A88 + .long 0x12256A88 + .long 0x12466A88 + + .long 0x12865288 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12676A88 + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12875A88 + .long 0x1252A0C0 + lvx 12,31,11 + .long 0x12885A88 + .long 0x1273A0C0 + lvx 11,30,11 + + .long 0x104238C0 + .long 0x100028C0 
+ .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12875288 + .long 0x1231A0C0 + .long 0x12885288 + .long 0x1252A0C0 + lvx 14,8,11 + .long 0x12895288 + .long 0x1273A0C0 + lvx 13,7,11 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x7F406699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + lvx 12,27,11 + .long 0x12815888 + .long 0x1273A0C0 + lvx 11,0,11 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + lvx 14,29,11 + .long 0x12806888 + .long 0x1273A0C0 + lvx 13,28,11 + + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12D6D6C4 + .long 0x1355CEC4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + vand 21,21,29 + vand 26,26,29 + vand 22,22,29 + vand 23,23,29 + + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10DA368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + addi 4,4,0x40 + bdnz .Loop_vsx + + neg 5,5 + andi. 
5,5,0x30 + sub 4,4,5 + + .long 0x7D5D1E99 + .long 0x7D605699 + .long 0x7D9B5699 + .long 0x7DBC5699 + .long 0x7DDD5699 + +.Last_vsx: + .long 0x11E55288 + .long 0x12065288 + .long 0x12275288 + .long 0x12485288 + .long 0x12695288 + + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12855A88 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12875A88 + .long 0x1252A0C0 + .long 0x7D9F5699 + .long 0x12885A88 + .long 0x1273A0C0 + .long 0x7D7E5699 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12856A88 + .long 0x1231A0C0 + .long 0x12866A88 + .long 0x1252A0C0 + .long 0x7DC85699 + .long 0x12876A88 + .long 0x1273A0C0 + .long 0x7DA75699 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + .long 0x7D9B5699 + .long 0x12815888 + .long 0x1273A0C0 + .long 0x7D605699 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + .long 0x7DDD5699 + .long 0x12806888 + .long 0x1273A0C0 + .long 0x7DBC5699 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + + + + .long 0xF00F7A57 + .long 0xF0308257 + .long 0xF0518A57 + .long 0xF0729257 + .long 0xF0939A57 + .long 0x11EF00C0 + .long 0x121008C0 + .long 0x123110C0 + .long 0x125218C0 + .long 0x127320C0 + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + beq .Ldone_vsx + + add 6,12,5 + + .long 0x7F406699 + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + vperm 21,21,21,26 + vperm 22,22,22,26 + vperm 23,23,23,26 + vperm 24,24,24,26 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF297C057 + vperm 21,23,24,31 + .long 0xF2D7C357 + + .long 0x7DE03699 + .long 0x7E1D3699 + + .long 0x12F4CEC4 + .long 0x12B5D6C4 + .long 0x1316DEC4 + .long 0x12D6E6C4 + vand 20,20,29 + vand 23,23,29 + vand 21,21,29 + vand 22,22,29 + + + .long 0x11384E8C + .long 0x10B42E8C + .long 
0x10D7368C + .long 0x10F53E8C + .long 0x1116468C + vor 9,9,30 + + vperm 0,0,0,15 + vand 5,5, 16 + vperm 1,1,1,15 + vand 6,6, 16 + vperm 2,2,2,15 + vand 7,7, 16 + vperm 3,3,3,15 + vand 8,8, 16 + vperm 4,4,4,15 + vand 9,9, 16 + + .long 0x10A500C0 + vxor 0,0,0 + .long 0x10C608C0 + vxor 1,1,1 + .long 0x10E710C0 + vxor 2,2,2 + .long 0x110818C0 + vxor 3,3,3 + .long 0x112920C0 + vxor 4,4,4 + + xor. 5,5,5 + b .Last_vsx + +.align 4 +.Ldone_vsx: + ld 0,448(1) + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001919 + .long 0x7C3B1919 + .long 0x7C5C1919 + .long 0x7C7D1919 + .long 0x7C9E1919 + + lwz 12,388(1) + mtlr 0 + li 10,191 + li 11,207 + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,10,1 + addi 10,10,32 + lvx 22,11,1 + addi 11,11,32 + lvx 23,10,1 + addi 10,10,32 + lvx 24,11,1 + addi 11,11,32 + lvx 25,10,1 + addi 10,10,32 + lvx 26,11,1 + addi 11,11,32 + lvx 27,10,1 + addi 10,10,32 + lvx 28,11,1 + addi 11,11,32 + lvx 29,10,1 + addi 10,10,32 + lvx 30,11,1 + lvx 31,10,1 + ld 27,392(1) + ld 28,400(1) + ld 29,408(1) + ld 30,416(1) + ld 31,424(1) + addi 1,1,432 + blr +.long 0 +.byte 0,12,0x04,1,0x80,5,4,0 +.long 0 +.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx + +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 12 + addi 12,12,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 + +.long 0x00000000,0x03ffffff +.long 0x00000000,0x03ffffff +.long 0x00000000,0x0000001a +.long 0x00000000,0x0000001a +.long 0x00000000,0x00000028 +.long 0x00000000,0x00000028 +.long 0x00000000,0x0e0f0001 +.long 0x00000000,0x1e1f1011 +.long 0x01000000,0x01000000 +.long 0x01000000,0x01000000 +.long 0x07060504,0x03020100 +.long 0x0f0e0d0c,0x0b0a0908 + +.long 0x00000000,0x00000000 +.long 0x00000000,0x04050607 +.long 0x04050607,0x00000000 +.long 0x00000000,0x00000000 +.long 0x00000000,0x00000000 +.long 0x04050607,0x00000000 + +.long 0xffffffff,0x00000000 +.long 0xffffffff,0xffffffff +.long 0xffffffff,0x00000000 +.long 0xffffffff,0x00000000 +.long 0x00000000,0x00000000 +.long 0xffffffff,0x00000000 +.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 diff --git a/sys/crypto/openssl/powerpc64/vpaes-ppc.S b/sys/crypto/openssl/powerpc64/vpaes-ppc.S index bf0e1b3b89999..e92c9cd555262 100644 --- a/sys/crypto/openssl/powerpc64/vpaes-ppc.S +++ b/sys/crypto/openssl/powerpc64/vpaes-ppc.S @@ -674,7 +674,7 @@ vpaes_cbc_encrypt: vor 24,0,0 sub. 30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -726,7 +726,7 @@ vpaes_cbc_encrypt: vor 24,25,25 sub. 
30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -1044,7 +1044,7 @@ _vpaes_schedule_core: vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1066,7 +1066,7 @@ _vpaes_schedule_core: addi 9, 5, -15 vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1157,7 +1157,7 @@ _vpaes_schedule_low_round: vsldoi 1, 9, 7, 12 vxor 7, 7, 1 - vspltisb 1, 0x0f + vspltisb 1,0x0f vsldoi 4, 9, 7, 8 @@ -1253,7 +1253,7 @@ _vpaes_schedule_mangle: vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr @@ -1304,7 +1304,7 @@ _vpaes_schedule_mangle: vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr diff --git a/sys/crypto/openssl/powerpc64le/bn-ppc.S b/sys/crypto/openssl/powerpc64le/bn-ppc.S new file mode 100644 index 0000000000000..50805d3d52bc7 --- /dev/null +++ b/sys/crypto/openssl/powerpc64le/bn-ppc.S @@ -0,0 +1,1876 @@ +/* Do not modify. This file is auto-generated from ppc.pl. */ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.globl bn_sqr_comba4 +.type bn_sqr_comba4,@function +.globl bn_sqr_comba8 +.type bn_sqr_comba8,@function +.globl bn_mul_comba4 +.type bn_mul_comba4,@function +.globl bn_mul_comba8 +.type bn_mul_comba8,@function +.globl bn_sub_words +.type bn_sub_words,@function +.globl bn_add_words +.type bn_add_words,@function +.globl bn_div_words +.type bn_div_words,@function +.globl bn_sqr_words +.type bn_sqr_words,@function +.globl bn_mul_words +.type bn_mul_words,@function +.globl bn_mul_add_words +.type bn_mul_add_words,@function + + + +.machine "any" +.abiversion 2 +.text + + + + + + + + +.align 4 +bn_sqr_comba4: +.localentry bn_sqr_comba4,0 + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + ld 5,0(4) + mulld 9,5,5 + mulhdu 10,5,5 + + + + + std 9,0(3) + + ld 6,8(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 9,0 + + + addc 10,7,10 + addze 11,8 + addze 9,9 + + std 10,8(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,16(3) + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 7,7,7 + adde 8,8,8 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,24(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,32(3) + + ld 5,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 7,7,7 + adde 8,8,8 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,40(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + + std 9,48(3) + std 10,56(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba4,.-bn_sqr_comba4 + + + + + + + + +.align 4 +bn_sqr_comba8: +.localentry bn_sqr_comba8,0 + + + + + + + + + + + + + + + + + + + + + xor 0,0,0 + + + + ld 5,0(4) + mulld 9,5,5 + mulhdu 10,5,5 + std 9,0(3) + + ld 6,8(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,0 + addze 9,0 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + std 10,8(3) + + + mulld 7,6,6 + mulhdu 8,6,6 + addc 
11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + std 11,16(3) + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + std 9,24(3) + + mulld 7,6,6 + mulhdu 8,6,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,0(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,32(3) + + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,8(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,16(4) + ld 6,24(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,40(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,8(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,0(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,48(3) + + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,8(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,16(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,24(4) + ld 6,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,56(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,16(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,8(4) + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,64(3) + + ld 5,16(4) + mulld 7,5,6 + mulhdu 8,5,6 + + addc 9,7,9 + adde 10,8,10 + addze 11,0 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,24(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + + ld 5,32(4) + ld 6,40(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,72(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + + ld 5,24(4) + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 
11,8,11 + addze 9,9 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,80(3) + + ld 5,32(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,0 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + + ld 5,40(4) + ld 6,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + addc 11,7,11 + adde 9,8,9 + addze 10,10 + std 11,88(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 9,7,9 + adde 10,8,10 + addze 11,0 + + ld 6,56(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + addc 9,7,9 + adde 10,8,10 + addze 11,11 + std 9,96(3) + + + ld 5,48(4) + mulld 7,5,6 + mulhdu 8,5,6 + addc 10,7,10 + adde 11,8,11 + addze 9,0 + addc 10,7,10 + adde 11,8,11 + addze 9,9 + std 10,104(3) + + mulld 7,6,6 + mulhdu 8,6,6 + addc 11,7,11 + adde 9,8,9 + std 11,112(3) + std 9, 120(3) + + + blr +.long 0 +.byte 0,12,0x14,0,0,0,2,0 +.long 0 +.size bn_sqr_comba8,.-bn_sqr_comba8 + + + + + + + + +.align 4 +bn_mul_comba4: +.localentry bn_mul_comba4,0 + + + + + + + + + + + + + xor 0,0,0 + + ld 6,0(4) + ld 7,0(5) + mulld 10,6,7 + mulhdu 11,6,7 + std 10,0(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,0 + addze 10,0 + + ld 6, 8(4) + ld 7, 0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + std 11,8(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + ld 6,8(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + + ld 6,0(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + std 12,16(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,0 + + ld 6,8(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + ld 6,16(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + + ld 6,24(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + addze 12,12 + std 10,24(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,0 + + ld 6,16(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + + ld 6,8(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,8,11 + adde 12,9,12 + addze 10,10 + std 11,32(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,0 + + ld 6,24(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,8,12 + adde 10,9,10 + addze 11,11 + std 12,40(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,8,10 + adde 11,9,11 + + std 10,48(3) + std 11,56(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba4,.-bn_mul_comba4 + + + + + + + + +.align 4 +bn_mul_comba8: +.localentry bn_mul_comba8,0 + + + + + + + + + + + + + xor 0,0,0 + + + ld 6,0(4) + ld 7,0(5) + mulld 10,6,7 + mulhdu 11,6,7 + std 10,0(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + addze 12,9 + addze 10,0 + + ld 6,8(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,8(3) + + ld 6,16(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,8(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,0(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,16(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,8(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + + 
ld 6,16(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,24(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,24(3) + + ld 6,32(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,24(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,16(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,8(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,0(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,32(3) + + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,8(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,16(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,24(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,32(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,40(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,40(3) + + ld 6,48(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,40(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,32(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,24(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,16(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,8(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,0(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,48(3) + + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,8(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,16(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,24(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,32(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,40(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,48(4) + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,56(4) + ld 7,0(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,56(3) + + ld 7,8(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,48(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,40(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,32(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,24(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,16(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,8(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,64(3) + + ld 6,16(4) + 
mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,24(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,32(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,40(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,48(4) + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,56(4) + ld 7,16(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,72(3) + + ld 7,24(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,48(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,40(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,32(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + + ld 6,24(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,80(3) + + ld 6,32(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,0 + + ld 6,40(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,48(4) + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + + ld 6,56(4) + ld 7,32(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + addze 11,11 + std 12,88(3) + + ld 7,40(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,0 + + ld 6,48(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + + ld 6,40(4) + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 10,10,8 + adde 11,11,9 + addze 12,12 + std 10,96(3) + + ld 6,48(4) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,0 + + ld 6,56(4) + ld 7,48(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 11,11,8 + adde 12,12,9 + addze 10,10 + std 11,104(3) + + ld 7,56(5) + mulld 8,6,7 + mulhdu 9,6,7 + addc 12,12,8 + adde 10,10,9 + std 12,112(3) + std 10,120(3) + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_mul_comba8,.-bn_mul_comba8 + + + + + + + + +.align 4 +bn_sub_words: +.localentry bn_sub_words,0 + + + + + + + + + + + + + + + xor 0,0,0 + + + + subfc. 7,0,6 + + + beq .Lppcasm_sub_adios + addi 4,4,-8 + addi 3,3,-8 + addi 5,5,-8 + mtctr 6 +.Lppcasm_sub_mainloop: + ldu 7,8(4) + ldu 8,8(5) + subfe 6,8,7 + + + stdu 6,8(3) + bdnz .Lppcasm_sub_mainloop +.Lppcasm_sub_adios: + subfze 3,0 + andi. 3,3,1 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_sub_words,.-bn_sub_words + + + + + + + + +.align 4 +bn_add_words: +.localentry bn_add_words,0 + + + + + + + + + + + + + + + xor 0,0,0 + + + + addic. 6,6,0 + beq .Lppcasm_add_adios + addi 4,4,-8 + addi 3,3,-8 + addi 5,5,-8 + mtctr 6 +.Lppcasm_add_mainloop: + ldu 7,8(4) + ldu 8,8(5) + adde 8,7,8 + stdu 8,8(3) + bdnz .Lppcasm_add_mainloop +.Lppcasm_add_adios: + addze 3,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_add_words,.-bn_add_words + + + + + + + + +.align 4 +bn_div_words: +.localentry bn_div_words,0 + + + + + + + + + + + + + cmpldi 0,5,0 + bne .Lppcasm_div1 + li 3,-1 + blr +.Lppcasm_div1: + xor 0,0,0 + li 8,64 + cntlzd. 7,5 + beq .Lppcasm_div2 + subf 8,7,8 + srd. 
9,3,8 + td 16,9,0 +.Lppcasm_div2: + cmpld 0,3,5 + blt .Lppcasm_div3 + subf 3,5,3 +.Lppcasm_div3: + cmpi 0,0,7,0 + beq .Lppcasm_div4 + sld 3,3,7 + srd 8,4,8 + sld 5,5,7 + or 3,3,8 + sld 4,4,7 +.Lppcasm_div4: + srdi 9,5,32 + + + li 6,2 + mtctr 6 +.Lppcasm_divouterloop: + srdi 8,3,32 + srdi 11,4,32 + + cmpld 0,8,9 + bne .Lppcasm_div5 + + li 8,-1 + clrldi 8,8,32 + b .Lppcasm_div6 +.Lppcasm_div5: + divdu 8,3,9 +.Lppcasm_div6: + mulld 12,9,8 + clrldi 10,5,32 + mulld 6,8,10 + +.Lppcasm_divinnerloop: + subf 10,12,3 + srdi 7,10,32 + addic. 7,7,0 + + + + sldi 7,10,32 + or 7,7,11 + cmpld 1,6,7 + bne .Lppcasm_divinnerexit + ble 1,.Lppcasm_divinnerexit + addi 8,8,-1 + subf 12,9,12 + clrldi 10,5,32 + subf 6,10,6 + b .Lppcasm_divinnerloop +.Lppcasm_divinnerexit: + srdi 10,6,32 + sldi 11,6,32 + cmpld 1,4,11 + add 12,12,10 + bge 1,.Lppcasm_div7 + addi 12,12,1 +.Lppcasm_div7: + subf 11,11,4 + cmpld 1,3,12 + bge 1,.Lppcasm_div8 + addi 8,8,-1 + add 3,5,3 +.Lppcasm_div8: + subf 12,12,3 + sldi 4,11,32 + + + + insrdi 11,12,32,32 + rotldi 3,11,32 + bdz .Lppcasm_div9 + sldi 0,8,32 + b .Lppcasm_divouterloop +.Lppcasm_div9: + or 3,8,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_div_words,.-bn_div_words + + + + + + + +.align 4 +bn_sqr_words: +.localentry bn_sqr_words,0 + + + + + + + + + + + + + + + + addic. 5,5,0 + beq .Lppcasm_sqr_adios + addi 4,4,-8 + addi 3,3,-8 + mtctr 5 +.Lppcasm_sqr_mainloop: + + ldu 6,8(4) + mulld 7,6,6 + mulhdu 8,6,6 + stdu 7,8(3) + stdu 8,8(3) + bdnz .Lppcasm_sqr_mainloop +.Lppcasm_sqr_adios: + blr +.long 0 +.byte 0,12,0x14,0,0,0,3,0 +.long 0 +.size bn_sqr_words,.-bn_sqr_words + + + + + + + + +.align 4 +bn_mul_words: +.localentry bn_mul_words,0 + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 7,5,30,2,31 + beq .Lppcasm_mw_REM + mtctr 7 +.Lppcasm_mw_LOOP: + + ld 8,0(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + + + + + std 9,0(3) + + ld 8,8(4) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + + std 11,8(3) + + ld 8,16(4) + mulld 9,6,8 + mulhdu 10,6,8 + adde 9,9,12 + + std 9,16(3) + + ld 8,24(4) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + + std 11,24(3) + + addi 3,3,32 + addi 4,4,32 + bdnz .Lppcasm_mw_LOOP + +.Lppcasm_mw_REM: + andi. 5,5,0x3 + beq .Lppcasm_mw_OVER + + ld 8,0(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,0(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + + ld 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,8(3) + addi 12,10,0 + + addi 5,5,-1 + cmpli 0,0,5,0 + beq .Lppcasm_mw_OVER + + + ld 8,16(4) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + std 9,16(3) + addi 12,10,0 + +.Lppcasm_mw_OVER: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_words,.-bn_mul_words + + + + + + + + +.align 4 +bn_mul_add_words: +.localentry bn_mul_add_words,0 + + + + + + + + + + + + xor 0,0,0 + xor 12,12,12 + rlwinm. 
7,5,30,2,31 + beq .Lppcasm_maw_leftover + mtctr 7 +.Lppcasm_maw_mainloop: + + ld 8,0(4) + ld 11,0(3) + mulld 9,6,8 + mulhdu 10,6,8 + addc 9,9,12 + addze 10,10 + addc 9,9,11 + + + + + + + std 9,0(3) + + + ld 8,8(4) + ld 9,8(3) + mulld 11,6,8 + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + + std 11,8(3) + + + ld 8,16(4) + mulld 9,6,8 + ld 11,16(3) + mulhdu 10,6,8 + adde 9,9,12 + addze 10,10 + addc 9,9,11 + + std 9,16(3) + + + ld 8,24(4) + mulld 11,6,8 + ld 9,24(3) + mulhdu 12,6,8 + adde 11,11,10 + addze 12,12 + addc 11,11,9 + addze 12,12 + std 11,24(3) + addi 3,3,32 + addi 4,4,32 + bdnz .Lppcasm_maw_mainloop + +.Lppcasm_maw_leftover: + andi. 5,5,0x3 + beq .Lppcasm_maw_adios + addi 3,3,-8 + addi 4,4,-8 + + mtctr 5 + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + + bdz .Lppcasm_maw_adios + + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + + bdz .Lppcasm_maw_adios + + ldu 8,8(4) + mulld 9,6,8 + mulhdu 10,6,8 + ldu 11,8(3) + addc 9,9,11 + addze 10,10 + addc 9,9,12 + addze 12,10 + std 9,0(3) + +.Lppcasm_maw_adios: + addi 3,12,0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.long 0 +.size bn_mul_add_words,.-bn_mul_add_words +.align 4 diff --git a/sys/crypto/openssl/powerpc64le/ecp_nistp521-ppc64.S b/sys/crypto/openssl/powerpc64le/ecp_nistp521-ppc64.S new file mode 100644 index 0000000000000..5905180d168a6 --- /dev/null +++ b/sys/crypto/openssl/powerpc64le/ecp_nistp521-ppc64.S @@ -0,0 +1,354 @@ +/* Do not modify. This file is auto-generated from ecp_nistp521-ppc64.pl. */ +.machine "any" +.abiversion 2 +.text + +.globl p521_felem_mul +.type p521_felem_mul,@function +.align 5 +p521_felem_mul: +.localentry p521_felem_mul,0 + + + mr 12,1 + stdu 1,-16*13(1) + + stxv 52,-16*12(12) + stxv 53,-16*11(12) + stxv 54,-16*10(12) + stxv 55,-16*9(12) + stxv 56,-16*8(12) + stxv 57,-16*7(12) + stxv 58,-16*6(12) + stxv 59,-16*5(12) + stxv 60,-16*4(12) + stxv 61,-16*3(12) + stxv 62,-16*2(12) + stxv 63,-16*1(12) + + vspltisw 0,0 + + lxsd 13,0(4) + lxsd 14,8(4) + lxsd 15,16(4) + lxsd 16,24(4) + lxsd 17,32(4) + lxsd 18,40(4) + lxsd 19,48(4) + lxsd 20,56(4) + lxsd 21,64(4) + + lxsd 3,0(5) + lxsd 4,8(5) + lxsd 5,16(5) + lxsd 6,24(5) + lxsd 7,32(5) + lxsd 8,40(5) + lxsd 9,48(5) + lxsd 10,56(5) + lxsd 11,64(5) + + .long 0x12ED1823 + + xxpermdi 33,45,46,0b00 + xxpermdi 34,36,35,0b00 + .long 0x13011023 + + xxpermdi 34,37,36,0b00 + .long 0x13211023 + .long 0x132F1E63 + + xxpermdi 34,38,37,0b00 + .long 0x13411023 + xxpermdi 44,47,48,0b00 + xxpermdi 54,36,35,0b00 + .long 0x134CB6A3 + + xxpermdi 34,39,38,0b00 + .long 0x13611023 + xxpermdi 54,37,36,0b00 + .long 0x136CB6E3 + .long 0x13711EE3 + + xxpermdi 34,40,39,0b00 + .long 0x13811023 + xxpermdi 54,38,37,0b00 + .long 0x138CB723 + + xxpermdi 34,41,40,0b00 + .long 0x13A11023 + xxpermdi 54,39,38,0b00 + .long 0x13ACB763 + + xxpermdi 34,42,41,0b00 + .long 0x13C11023 + xxpermdi 54,40,39,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,43,42,0b00 + .long 0x13E11023 + xxpermdi 54,41,40,0b00 + .long 0x13ECB7E3 + + xxpermdi 33,49,50,0b00 + xxpermdi 34,36,35,0b00 + .long 0x13811723 + + xxpermdi 34,37,36,0b00 + .long 0x13A11763 + .long 0x13B31F63 + + xxpermdi 34,38,37,0b00 + .long 0x13C117A3 + xxpermdi 44,51,52,0b00 + xxpermdi 54,36,35,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,39,38,0b00 + .long 0x13E117E3 + xxpermdi 54,37,36,0b00 + .long 0x13ECB7E3 + .long 0x13F51FE3 + + li 8,0 + li 9,1 + mtvsrdd 33,9,8 + .long 0x10630DC4 + .long 
0x10840DC4 + .long 0x10A50DC4 + .long 0x10C60DC4 + .long 0x10E70DC4 + .long 0x11080DC4 + .long 0x11290DC4 + .long 0x114A0DC4 + .long 0x116B0DC4 + + .long 0x13D55FA3 + + xxpermdi 34,43,42,0b00 + xxpermdi 33,52,53,0b00 + .long 0x13A11763 + + xxpermdi 33,51,52,0b00 + .long 0x13811723 + .long 0x13954F23 + + xxpermdi 33,50,51,0b00 + .long 0x136116E3 + xxpermdi 54,41,40,0b00 + xxpermdi 44,52,53,0b00 + .long 0x136CB6E3 + + xxpermdi 33,49,50,0b00 + .long 0x134116A3 + xxpermdi 44,51,52,0b00 + .long 0x134CB6A3 + .long 0x13553EA3 + + xxpermdi 33,48,49,0b00 + .long 0x13211663 + xxpermdi 44,50,51,0b00 + .long 0x132CB663 + + xxpermdi 33,47,48,0b00 + .long 0x13011623 + xxpermdi 44,49,50,0b00 + .long 0x130CB623 + + xxpermdi 33,46,47,0b00 + .long 0x12E115E3 + xxpermdi 44,48,49,0b00 + .long 0x12ECB5E3 + + xxpermdi 34,39,38,0b00 + xxpermdi 33,52,53,0b00 + .long 0x13211663 + + xxpermdi 33,51,52,0b00 + .long 0x13011623 + .long 0x13152E23 + + xxpermdi 33,50,51,0b00 + .long 0x12E115E3 + xxpermdi 54,37,36,0b00 + xxpermdi 44,52,53,0b00 + .long 0x12ECB5E3 + + stxv 55,0(3) + stxv 56,16(3) + stxv 57,32(3) + stxv 58,48(3) + stxv 59,64(3) + stxv 60,80(3) + stxv 61,96(3) + stxv 62,112(3) + stxv 63,128(3) + + ld 12,0(1) + lxv 52,-16*12(12) + lxv 53,-16*11(12) + lxv 54,-16*10(12) + lxv 55,-16*9(12) + lxv 56,-16*8(12) + lxv 57,-16*7(12) + lxv 58,-16*6(12) + lxv 59,-16*5(12) + lxv 60,-16*4(12) + lxv 61,-16*3(12) + lxv 62,-16*2(12) + lxv 63,-16*1(12) + mr 1,12 + + blr +.size p521_felem_mul,.-p521_felem_mul + +.globl p521_felem_square +.type p521_felem_square,@function +.align 5 +p521_felem_square: +.localentry p521_felem_square,0 + + + mr 12,1 + stdu 1,-16*13(1) + + stxv 52,-16*12(12) + stxv 53,-16*11(12) + stxv 54,-16*10(12) + stxv 55,-16*9(12) + stxv 56,-16*8(12) + stxv 57,-16*7(12) + stxv 58,-16*6(12) + stxv 59,-16*5(12) + stxv 60,-16*4(12) + stxv 61,-16*3(12) + stxv 62,-16*2(12) + stxv 63,-16*1(12) + + vspltisw 0,0 + + lxsd 13,0(4) + lxsd 14,8(4) + lxsd 15,16(4) + lxsd 16,24(4) + lxsd 17,32(4) + lxsd 18,40(4) + lxsd 19,48(4) + lxsd 20,56(4) + lxsd 21,64(4) + + li 8,0 + li 9,1 + mtvsrdd 33,9,8 + .long 0x106D0DC4 + .long 0x108E0DC4 + .long 0x10AF0DC4 + .long 0x10D00DC4 + .long 0x10F10DC4 + .long 0x11120DC4 + .long 0x11330DC4 + .long 0x11540DC4 + .long 0x11750DC4 + .long 0x12ED6823 + + .long 0x130D2023 + + xxpermdi 33,45,46,0b00 + xxpermdi 34,37,46,0b00 + .long 0x13211023 + + xxpermdi 34,38,37,0b00 + .long 0x13411023 + + xxpermdi 34,39,38,0b00 + .long 0x13611023 + .long 0x136F7EE3 + + xxpermdi 34,40,39,0b00 + .long 0x13811023 + .long 0x138F3723 + + xxpermdi 34,41,40,0b00 + .long 0x13A11023 + xxpermdi 44,47,48,0b00 + xxpermdi 54,39,48,0b00 + .long 0x13ACB763 + + xxpermdi 34,42,41,0b00 + .long 0x13C11023 + xxpermdi 54,40,39,0b00 + .long 0x13CCB7A3 + + xxpermdi 34,43,42,0b00 + .long 0x13E11023 + xxpermdi 54,41,40,0b00 + .long 0x13ECB7E3 + .long 0x13F18FE3 + + .long 0x13124623 + + .long 0x13534EA3 + + .long 0x13945723 + + .long 0x13D55FA3 + + mtvsrdd 33,9,8 + .long 0x11080DC4 + .long 0x11290DC4 + .long 0x114A0DC4 + .long 0x116B0DC4 + + .long 0x13B45F63 + + .long 0x13935F23 + + xxpermdi 34,43,42,0b00 + xxpermdi 33,50,51,0b00 + .long 0x136116E3 + + xxpermdi 33,49,50,0b00 + .long 0x134116A3 + + xxpermdi 33,48,49,0b00 + .long 0x13211663 + .long 0x13324E63 + + xxpermdi 33,47,48,0b00 + .long 0x13011623 + .long 0x13114E23 + + xxpermdi 33,46,47,0b00 + .long 0x12E115E3 + xxpermdi 34,41,40,0b00 + xxpermdi 33,48,49,0b00 + .long 0x12E115E3 + + stxv 55,0(3) + stxv 56,16(3) + stxv 57,32(3) + stxv 58,48(3) + stxv 59,64(3) + stxv 60,80(3) + 
stxv 61,96(3) + stxv 62,112(3) + stxv 63,128(3) + + ld 12,0(1) + lxv 52,-16*12(12) + lxv 53,-16*11(12) + lxv 54,-16*10(12) + lxv 55,-16*9(12) + lxv 56,-16*8(12) + lxv 57,-16*7(12) + lxv 58,-16*6(12) + lxv 59,-16*5(12) + lxv 60,-16*4(12) + lxv 61,-16*3(12) + lxv 62,-16*2(12) + lxv 63,-16*1(12) + mr 1,12 + + blr +.size p521_felem_square,.-p521_felem_square + diff --git a/sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S b/sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S index 1fb70dd39b256..3451068190b8e 100644 --- a/sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S +++ b/sys/crypto/openssl/powerpc64le/keccak1600-ppc64.S @@ -304,19 +304,19 @@ KeccakF1600: dword_le_load: .localentry dword_le_load,0 - lbzu 0,1(3) - lbzu 4,1(3) - lbzu 5,1(3) + lbz 0,1(3) + lbz 4,2(3) + lbz 5,3(3) insrdi 0,4,8,48 - lbzu 4,1(3) + lbz 4,4(3) insrdi 0,5,8,40 - lbzu 5,1(3) + lbz 5,5(3) insrdi 0,4,8,32 - lbzu 4,1(3) + lbz 4,6(3) insrdi 0,5,8,24 - lbzu 5,1(3) + lbz 5,7(3) insrdi 0,4,8,16 - lbzu 4,1(3) + lbzu 4,8(3) insrdi 0,5,8,8 insrdi 0,4,8,0 blr @@ -579,21 +579,21 @@ SHA3_squeeze: cmpldi 30,8 blt .Lsqueeze_tail - stbu 0,1(29) + stb 0,1(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,2(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,3(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,4(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,5(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,6(29) srdi 0,0,8 - stbu 0,1(29) + stb 0,7(29) srdi 0,0,8 - stbu 0,1(29) + stbu 0,8(29) subic. 30,30,8 beq .Lsqueeze_done diff --git a/sys/crypto/openssl/powerpc64le/poly1305-ppc.S b/sys/crypto/openssl/powerpc64le/poly1305-ppc.S index 7ffdb4a3d39b1..69862b94b2c3b 100644 --- a/sys/crypto/openssl/powerpc64le/poly1305-ppc.S +++ b/sys/crypto/openssl/powerpc64le/poly1305-ppc.S @@ -12,6 +12,7 @@ poly1305_init_int: std 0,0(3) std 0,8(3) std 0,16(3) + stw 0,24(3) cmpld 4,0 beq- .Lno_key @@ -41,6 +42,7 @@ poly1305_init_int: poly1305_blocks: .localentry poly1305_blocks,0 +.Lpoly1305_blocks: srdi. 
5,5,4 beq- .Labort @@ -124,39 +126,1003 @@ poly1305_blocks: .long 0 .byte 0,12,4,1,0x80,5,4,0 .size poly1305_blocks,.-poly1305_blocks - .globl poly1305_emit .type poly1305_emit,@function -.align 4 +.align 5 poly1305_emit: .localentry poly1305_emit,0 - ld 7,0(3) - ld 8,8(3) - ld 9,16(3) - ld 6,0(5) - ld 5,8(5) + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + lwz 0,24(3) + + sldi 8,8,26 + sldi 12,9,52 + srdi 9,9,12 + sldi 10,10,14 + add 7,7,8 + addc 7,7,12 + sldi 12,11,40 + srdi 11,11,24 + adde 8,9,10 + addc 8,8,12 + addze 9,11 + + ld 10,0(3) + ld 11,8(3) + ld 12,16(3) + + neg 0,0 + xor 7,7,10 + xor 8,8,11 + xor 9,9,12 + and 7,7,0 + and 8,8,0 + and 9,9,0 + xor 7,7,10 + xor 8,8,11 + xor 9,9,12 addic 10,7,5 addze 11,8 addze 12,9 - srdi 0,12,2 - neg 0,0 + srdi 12,12,2 + neg 12,12 - andc 7,7,0 - and 10,10,0 - andc 8,8,0 - and 11,11,0 + andc 7,7,12 + and 10,10,12 + andc 8,8,12 + and 11,11,12 or 7,7,10 or 8,8,11 - addc 7,7,6 - adde 8,8,5 - std 7,0(4) - std 8,8(4) + + lwz 12,4(5) + lwz 9,12(5) + lwz 10,0(5) + lwz 11,8(5) + + insrdi 10,12,32,0 + insrdi 11,9,32,0 + + addc 7,7,10 + adde 8,8,11 + + addi 3,4,-1 + addi 4,4,7 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + srdi 7,7,8 + stbu 8,1(4) + srdi 8,8,8 + + stbu 7,1(3) + stbu 8,1(4) + blr .long 0 .byte 0,12,0x14,0,0,0,3,0 .size poly1305_emit,.-poly1305_emit -.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.globl poly1305_blocks_vsx +.type poly1305_blocks_vsx,@function +.align 5 +poly1305_blocks_vsx: +.localentry poly1305_blocks_vsx,0 + + lwz 7,24(3) + cmpldi 5,128 + bge __poly1305_blocks_vsx + + neg 0,7 + lwz 7,0(3) + lwz 8,4(3) + lwz 9,8(3) + lwz 10,12(3) + lwz 11,16(3) + + sldi 8,8,26 + sldi 12,9,52 + add 7,7,8 + srdi 9,9,12 + sldi 10,10,14 + addc 7,7,12 + sldi 8,11,40 + adde 9,9,10 + srdi 11,11,24 + addc 9,9,8 + addze 11,11 + + ld 8,0(3) + ld 10,8(3) + ld 12,16(3) + + xor 7,7,8 + xor 9,9,10 + xor 11,11,12 + and 7,7,0 + and 9,9,0 + and 11,11,0 + xor 7,7,8 + xor 9,9,10 + xor 11,11,12 + + li 0,0 + std 7,0(3) + std 9,8(3) + std 11,16(3) + stw 0,24(3) + + b .Lpoly1305_blocks +.long 0 +.byte 0,12,0x14,0,0,0,4,0 +.size poly1305_blocks_vsx,.-poly1305_blocks_vsx + +.align 5 +__poly1305_mul: + mulld 9,6,27 + mulhdu 10,6,27 + + mulld 30,7,29 + mulhdu 31,7,29 + addc 9,9,30 + adde 10,10,31 + + mulld 30,6,28 + mulhdu 11,6,28 + addc 10,10,30 + addze 11,11 + + mulld 30,7,27 + mulhdu 31,7,27 + addc 10,10,30 + adde 11,11,31 + + mulld 30,8,29 + mulld 31,8,27 + addc 10,10,30 + adde 11,11,31 + + andc 30,11,0 + and 8,11,0 + srdi 31,30,2 + add 30,30,31 + addc 6,9,30 + addze 7,10 + addze 8,8 + + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_mul,.-__poly1305_mul + +.align 5 +__poly1305_splat: + rldicl 9,6,0,38 + rldicl 10,6,38,38 + stw 9,0x00(31) + + rldicl 11,6,12,52 + slwi 9,10,2 + stw 10,0x10(31) + add 9,9,10 + stw 9,0x20(31) + + insrdi 11,7,14,38 + slwi 9,11,2 + stw 11,0x30(31) + add 9,9,11 + stw 9,0x40(31) + + rldicl 10,7,50,38 + rldicl 11,7,24,40 + slwi 9,10,2 + stw 10,0x50(31) + add 9,9,10 + stw 9,0x60(31) + + insrdi 11,8,3,37 + slwi 9,11,2 + stw 11,0x70(31) + add 9,9,11 + stw 9,0x80(31) + + blr 
+.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_splat,.-__poly1305_splat + +.align 5 +__poly1305_blocks_vsx: + stdu 1,-432(1) + mflr 0 + li 10,191 + li 11,207 + li 12,-1 + stvx 20,10,1 + addi 10,10,32 + stvx 21,11,1 + addi 11,11,32 + stvx 22,10,1 + addi 10,10,32 + stvx 23,10,1 + addi 10,10,32 + stvx 24,11,1 + addi 11,11,32 + stvx 25,10,1 + addi 10,10,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + stw 12,388(1) + li 12,-1 + or 12,12,12 + std 27,392(1) + std 28,400(1) + std 29,408(1) + std 30,416(1) + std 31,424(1) + std 0,448(1) + + bl .LPICmeup + + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + .long 0x7FA06699 + .long 0x7F3B6699 + .long 0x7F7C6699 + .long 0x7FFD6699 + .long 0x7FDE6699 + + cmplwi 7,0 + bne .Lskip_init_vsx + + ld 27,32(3) + ld 28,40(3) + srdi 29,28,2 + li 0,3 + add 29,29,28 + + mr 6,27 + mr 7,28 + li 8,0 + addi 31,3,60 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,52 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,56 + bl __poly1305_splat + + bl __poly1305_mul + addi 31,3,48 + bl __poly1305_splat + + ld 6,0(3) + ld 7,8(3) + ld 8,16(3) + + rldicl 9,6,0,38 + rldicl 10,6,38,38 + rldicl 11,6,12,52 + .long 0x7C0901E7 + insrdi 11,7,14,38 + .long 0x7C2A01E7 + rldicl 10,7,50,38 + .long 0x7C4B01E7 + rldicl 11,7,24,40 + .long 0x7C6A01E7 + insrdi 11,8,3,37 + .long 0x7C8B01E7 + li 0,1 + stw 0,24(3) + b .Loaded_vsx + +.align 4 +.Lskip_init_vsx: + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001819 + .long 0x7C3B1819 + .long 0x7C5C1819 + .long 0x7C7D1819 + .long 0x7C9E1819 + +.Loaded_vsx: + li 27,0x10 + li 28,0x20 + li 29,0x30 + li 30,0x40 + li 31,0x50 + li 7,0x60 + li 8,0x70 + addi 10,3,64 + addi 11,1,63 + + vxor 20,20,20 + .long 0xF000A057 + .long 0xF021A057 + .long 0xF042A057 + .long 0xF063A057 + .long 0xF084A057 + + + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + + + + + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + vspltisb 28,14 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x1295CEC4 + .long 0x12D6D6C4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + vand 21,21,29 + vand 20,20,29 + vand 22,22,29 + vand 23,23,29 + + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10D4368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x7D5D1A99 + .long 0x7D605299 + .long 0x7D9B5299 + .long 0x7DBC5299 + .long 0x7DDD5299 + .long 0x7EBE5299 + .long 0x7EDF5299 + .long 0x7EE75299 + .long 0x7F085299 + stvx 11,0,11 + stvx 12,27,11 + stvx 13,28,11 + stvx 14,29,11 + stvx 21,30,11 + stvx 22,31,11 + stvx 23,7,11 + stvx 24,8,11 + + addi 4,4,0x40 + addi 12,12,0x50 + addi 0,5,-64 + srdi 0,0,6 + mtctr 0 + b .Loop_vsx + +.align 4 +.Loop_vsx: + + + + + + + + + + + + + + + .long 0x11E55288 + .long 0x12055A88 + .long 0x12256A88 + .long 0x12466A88 + + .long 0x12865288 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12676A88 + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12875A88 + .long 0x1252A0C0 + lvx 12,31,11 + .long 0x12885A88 + .long 0x1273A0C0 + lvx 11,30,11 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12875288 + .long 0x1231A0C0 + .long 0x12885288 + .long 0x1252A0C0 + lvx 
14,8,11 + .long 0x12895288 + .long 0x1273A0C0 + lvx 13,7,11 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + + + + + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0xF2B7C057 + vperm 22,23,24,31 + .long 0xF2F7C357 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + lvx 12,27,11 + .long 0x12815888 + .long 0x1273A0C0 + lvx 11,0,11 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + lvx 14,29,11 + .long 0x12806888 + .long 0x1273A0C0 + lvx 13,28,11 + + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12D6D6C4 + .long 0x1355CEC4 + .long 0x1317DEC4 + .long 0x12F7E6C4 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + vand 21,21,29 + vand 26,26,29 + vand 22,22,29 + vand 23,23,29 + + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x11384E8C + .long 0x10B52E8C + .long 0x10DA368C + .long 0x10F63E8C + .long 0x1117468C + vor 9,9,30 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + addi 4,4,0x40 + bdnz .Loop_vsx + + neg 5,5 + andi. 
5,5,0x30 + sub 4,4,5 + + .long 0x7D5D1E99 + .long 0x7D605699 + .long 0x7D9B5699 + .long 0x7DBC5699 + .long 0x7DDD5699 + +.Last_vsx: + .long 0x11E55288 + .long 0x12065288 + .long 0x12275288 + .long 0x12485288 + .long 0x12695288 + + .long 0x12896288 + .long 0x11EFA0C0 + .long 0x12855A88 + .long 0x1210A0C0 + .long 0x12865A88 + .long 0x1231A0C0 + .long 0x12875A88 + .long 0x1252A0C0 + .long 0x7D9F5699 + .long 0x12885A88 + .long 0x1273A0C0 + .long 0x7D7E5699 + + .long 0x104238C0 + .long 0x100028C0 + .long 0x106340C0 + .long 0x102130C0 + .long 0x108448C0 + + .long 0x12887288 + .long 0x11EFA0C0 + .long 0x12897288 + .long 0x1210A0C0 + .long 0x12856A88 + .long 0x1231A0C0 + .long 0x12866A88 + .long 0x1252A0C0 + .long 0x7DC85699 + .long 0x12876A88 + .long 0x1273A0C0 + .long 0x7DA75699 + + .long 0x12876288 + .long 0x11EFA0C0 + .long 0x12886288 + .long 0x1210A0C0 + .long 0x12896288 + .long 0x1231A0C0 + .long 0x12855A88 + .long 0x1252A0C0 + .long 0x12865A88 + .long 0x1273A0C0 + + .long 0x12867288 + .long 0x11EFA0C0 + .long 0x12877288 + .long 0x1210A0C0 + .long 0x12887288 + .long 0x1231A0C0 + .long 0x12897288 + .long 0x1252A0C0 + .long 0x12856A88 + .long 0x1273A0C0 + + + .long 0x12805088 + .long 0x11EFA0C0 + .long 0x12815088 + .long 0x1210A0C0 + .long 0x12825088 + .long 0x1231A0C0 + .long 0x12835088 + .long 0x1252A0C0 + .long 0x12845088 + .long 0x1273A0C0 + + .long 0x12826088 + .long 0x11EFA0C0 + .long 0x12836088 + .long 0x1210A0C0 + .long 0x12846088 + .long 0x1231A0C0 + .long 0x12805888 + .long 0x1252A0C0 + .long 0x7D9B5699 + .long 0x12815888 + .long 0x1273A0C0 + .long 0x7D605699 + + .long 0x12817088 + .long 0x11EFA0C0 + .long 0x12827088 + .long 0x1210A0C0 + .long 0x12837088 + .long 0x1231A0C0 + .long 0x12847088 + .long 0x1252A0C0 + .long 0x7DDD5699 + .long 0x12806888 + .long 0x1273A0C0 + .long 0x7DBC5699 + + .long 0x12846088 + .long 0x11EFA0C0 + .long 0x12805888 + .long 0x1210A0C0 + .long 0x12815888 + .long 0x1231A0C0 + .long 0x12825888 + .long 0x1252A0C0 + .long 0x12835888 + .long 0x1273A0C0 + + .long 0x12837088 + .long 0x11EFA0C0 + .long 0x12847088 + .long 0x1210A0C0 + .long 0x12806888 + .long 0x1231A0C0 + .long 0x12816888 + .long 0x1252A0C0 + .long 0x12826888 + .long 0x1273A0C0 + + + + + .long 0xF00F7A57 + .long 0xF0308257 + .long 0xF0518A57 + .long 0xF0729257 + .long 0xF0939A57 + .long 0x11EF00C0 + .long 0x121008C0 + .long 0x123110C0 + .long 0x125218C0 + .long 0x127320C0 + + + + + vspltisb 20,2 + .long 0x1092CEC4 + .long 0x102FCEC4 + vand 3,18,29 + vand 0,15,29 + .long 0x108498C0 + .long 0x102180C0 + + .long 0x1264CEC4 + .long 0x1201CEC4 + vand 4,4,29 + vand 1,1,29 + .long 0x100098C0 + .long 0x105180C0 + + .long 0x1273A5C4 + .long 0x1222CEC4 + vand 2,2,29 + .long 0x100098C0 + .long 0x106388C0 + + .long 0x11E0CEC4 + .long 0x1243CEC4 + vand 0,0,29 + vand 3,3,29 + .long 0x102178C0 + .long 0x108490C0 + + beq .Ldone_vsx + + add 6,12,5 + + + .long 0x7EA02699 + .long 0x7EDB2699 + .long 0x7EFC2699 + .long 0x7F1D2699 + + + + + + .long 0xF0B5B057 + vspltisb 26,4 + vperm 7,21,22,31 + .long 0xF115B357 + + .long 0x10C5CEC4 + .long 0x10E7D6C4 + .long 0x1128DEC4 + .long 0x1108E6C4 + vand 5,5,29 + vand 6,6,29 + vand 7,7,29 + vand 8,8,29 + + .long 0xF297C057 + vperm 21,23,24,31 + .long 0xF2D7C357 + + .long 0x7DE03699 + .long 0x7E1D3699 + + .long 0x12F4CEC4 + .long 0x12B5D6C4 + .long 0x1316DEC4 + .long 0x12D6E6C4 + vand 20,20,29 + vand 23,23,29 + vand 21,21,29 + vand 22,22,29 + + + .long 0x11384E8C + .long 0x10B42E8C + .long 0x10D7368C + .long 0x10F53E8C + .long 0x1116468C + vor 9,9,30 + + vperm 0,0,0,15 + vand 5,5, 
16 + vperm 1,1,1,15 + vand 6,6, 16 + vperm 2,2,2,15 + vand 7,7, 16 + vperm 3,3,3,15 + vand 8,8, 16 + vperm 4,4,4,15 + vand 9,9, 16 + + .long 0x10A500C0 + vxor 0,0,0 + .long 0x10C608C0 + vxor 1,1,1 + .long 0x10E710C0 + vxor 2,2,2 + .long 0x110818C0 + vxor 3,3,3 + .long 0x112920C0 + vxor 4,4,4 + + xor. 5,5,5 + b .Last_vsx + +.align 4 +.Ldone_vsx: + ld 0,448(1) + li 27,4 + li 28,8 + li 29,12 + li 30,16 + .long 0x7C001919 + .long 0x7C3B1919 + .long 0x7C5C1919 + .long 0x7C7D1919 + .long 0x7C9E1919 + + lwz 12,388(1) + mtlr 0 + li 10,191 + li 11,207 + or 12,12,12 + lvx 20,10,1 + addi 10,10,32 + lvx 21,10,1 + addi 10,10,32 + lvx 22,11,1 + addi 11,11,32 + lvx 23,10,1 + addi 10,10,32 + lvx 24,11,1 + addi 11,11,32 + lvx 25,10,1 + addi 10,10,32 + lvx 26,11,1 + addi 11,11,32 + lvx 27,10,1 + addi 10,10,32 + lvx 28,11,1 + addi 11,11,32 + lvx 29,10,1 + addi 10,10,32 + lvx 30,11,1 + lvx 31,10,1 + ld 27,392(1) + ld 28,400(1) + ld 29,408(1) + ld 30,416(1) + ld 31,424(1) + addi 1,1,432 + blr +.long 0 +.byte 0,12,0x04,1,0x80,5,4,0 +.long 0 +.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx + +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 12 + addi 12,12,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 + +.long 0x03ffffff,0x00000000 +.long 0x03ffffff,0x00000000 +.long 0x0000001a,0x00000000 +.long 0x0000001a,0x00000000 +.long 0x00000028,0x00000000 +.long 0x00000028,0x00000000 +.long 0x0e0f0001,0x00000000 +.long 0x1e1f1011,0x00000000 +.long 0x01000000,0x01000000 +.long 0x01000000,0x01000000 +.long 0x03020100,0x07060504 +.long 0x0b0a0908,0x0f0e0d0c + +.long 0x00000000,0x00000000 +.long 0x04050607,0x00000000 +.long 0x00000000,0x04050607 +.long 0x00000000,0x00000000 +.long 0x00000000,0x00000000 +.long 0x00000000,0x04050607 + +.long 0x00000000,0xffffffff +.long 0xffffffff,0xffffffff +.long 0x00000000,0xffffffff +.long 0x00000000,0xffffffff +.long 0x00000000,0x00000000 +.long 0x00000000,0xffffffff +.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,80,80,67,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 .align 2 diff --git a/sys/crypto/openssl/powerpc64le/vpaes-ppc.S b/sys/crypto/openssl/powerpc64le/vpaes-ppc.S index 718232627e6d4..751f67c33c565 100644 --- a/sys/crypto/openssl/powerpc64le/vpaes-ppc.S +++ b/sys/crypto/openssl/powerpc64le/vpaes-ppc.S @@ -674,7 +674,7 @@ vpaes_cbc_encrypt: vor 24,0,0 sub. 30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -726,7 +726,7 @@ vpaes_cbc_encrypt: vor 24,25,25 sub. 30, 30, 0 vperm 0, 0, 0, 29 - vsel 1, 28, 0, 30 + vsel 1,28,0,30 vor 28,0,0 stvx 1, 0, 4 addi 4, 4, 16 @@ -1044,7 +1044,7 @@ _vpaes_schedule_core: vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1066,7 +1066,7 @@ _vpaes_schedule_core: addi 9, 5, -15 vperm 0, 0, 0, 29 li 10, 4 - vsel 2, 28, 0, 30 + vsel 2,28,0,30 li 11, 8 stvx 2, 0, 5 li 12, 12 @@ -1157,7 +1157,7 @@ _vpaes_schedule_low_round: vsldoi 1, 7, 9, 16-12 vxor 7, 7, 1 - vspltisb 1, 0x0f + vspltisb 1,0x0f vsldoi 4, 7, 9, 16-8 @@ -1253,7 +1253,7 @@ _vpaes_schedule_mangle: vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr @@ -1304,7 +1304,7 @@ _vpaes_schedule_mangle: vperm 1, 3, 3, 29 - vsel 2, 28, 1, 30 + vsel 2,28,1,30 vor 28,1,1 stvx 2, 0, 5 blr