From f813af63d81b3a104bc0d73f587603e698f3f9e7 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Mon, 24 Feb 2025 17:34:56 +0000 Subject: [PATCH] Update enc_bootloader with latest aes.S (picotool 333d571c) CK_JITTER is removed as the enc_bootloader runs from XOSC not ROSC --- bootloaders/encrypted/aes.S | 681 ++++++++++++++++--------- bootloaders/encrypted/config.h | 46 +- bootloaders/encrypted/enc_bootloader.c | 53 +- 3 files changed, 488 insertions(+), 292 deletions(-) diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S index ad6c448d8..e0d653237 100644 --- a/bootloaders/encrypted/aes.S +++ b/bootloaders/encrypted/aes.S @@ -1,3 +1,13 @@ +/* MEMORY LAYOUT ASSUMPTIONS + +The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see +the macro getchaffaddress. + +The stack must be located at the end of Y scratch RAM: see the memory +wiping at the end of ctr_crypt_s where memory between the start of Y +scratch RAM and the stack pointer is overwritten. 
+*/ + .syntax unified .cpu cortex-m33 .thumb @@ -5,26 +15,24 @@ #include "config.h" #include "hardware/platform_defs.h" #include "hardware/regs/addressmap.h" +#include "hardware/regs/clocks.h" #include "hardware/regs/sha256.h" +#include "hardware/regs/resets.h" +#include "hardware/regs/rosc.h" +#include "hardware/regs/trng.h" #include "hardware/rcp.h" -.global gen_lut_sbox -.global ctr_crypt_s -.global remap -.global gen_rand_sha -.global init_key +.global decrypt +.global chaff -.global rkey_s -.global lut_a,lut_a_map -.global lut_b,lut_b_map -.global rstate_sha,rstate_lfsr +.extern lock_key @ RCP macros #define CTAG0 0x2a #define CTAG1 0x2b #define CTAG2 0x2c -#define CTAG3 0x2d @ not used +#define CTAG3 0x2d #define CTAG4 0x2e #define CTAG5 0x30 #define CTAG6 0x31 @@ -41,9 +49,13 @@ #define CTAG17 0x3c #define CTAG18 0x3d @ not used -.macro SET_COUNT n +@ number of blocks from the TRNG processed to initialise rstate_sha +#define TRNG_BLOCKS 25 + +@ The lower the jitterpriority, the more jitter is applied (jitter is used when RC_JITTER > jitterpriority) +.macro SET_COUNT n,jitterpriority .if RC_COUNT -.if RC_JITTER +.if RC_JITTER > \jitterpriority rcp_count_set \n .else rcp_count_set_nodelay \n @@ -51,9 +63,9 @@ .endif .endm -.macro CHK_COUNT n +.macro CHK_COUNT n,jitterpriority .if RC_COUNT -.if RC_JITTER +.if RC_JITTER > \jitterpriority rcp_count_check \n .else rcp_count_check_nodelay \n @@ -61,9 +73,9 @@ .endif .endm -.macro GET_CANARY rx,tag +.macro GET_CANARY rx,tag,jitterpriority .if RC_CANARY -.if RC_JITTER +.if RC_JITTER > \jitterpriority rcp_canary_get \rx,\tag .else rcp_canary_get_nodelay \rx,\tag @@ -71,9 +83,9 @@ .endif .endm -.macro CHK_CANARY rx,tag +.macro CHK_CANARY rx,tag,jitterpriority .if RC_CANARY -.if RC_JITTER +.if RC_JITTER > \jitterpriority rcp_canary_check \rx,\tag .else rcp_canary_check_nodelay \rx,\tag @@ -81,18 +93,6 @@ .endif .endm -.macro GET_CANARY_NJ rx,tag @ with no jitter even if you ask for it (for situations where it would otherwise slow things down a lot) -.if RC_CANARY - 
rcp_canary_get_nodelay \rx,\tag -.endif -.endm - -.macro CHK_CANARY_NJ rx,tag @ with no jitter even if you ask for it -.if RC_CANARY - rcp_canary_check_nodelay \rx,\tag -.endif -.endm - .macro clear03 offset=0 getchaffaddress r0,\offset ldmia r0,{r0-r3} @@ -112,7 +112,9 @@ @ Put workspace in the second scratch area @ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants, @ otherwise they may end up silently replaced with 0 or 0xffffffff -.section .scratch_y.aes,"a",%progbits +.section .scratch_y.aes,"aw",%progbits + +workspace_start: @ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress @ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000) @@ -126,6 +128,37 @@ chaff: .space 48 +.balign 16 +rkey_s: @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words + @ see comment at init_key_4way for description of layout and meaning of rkey_s +.space 600 +rkey4way: @ scratch area for init_key_4way; could overlap this with other scratch space if need to save space +.space 128 +.if CT_BPERM +bperm_rand: @ 32 half words that define the oblivious permutation of blocks +.space 64 +.endif + +.balign 16 +permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) +perm16: +.space 16 +@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s +.balign 16 +fourway: @ Must be 0 mod 16 +shareA: @ 0 mod 16 +.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 +shareB: @ 4 mod 16 +.space 20 +shareC: @ 8 mod 16 +.space 4 +statevperm: @ 12 mod 16 +.space 4 @ vperm state rotation: only last two bits are operational; other bits random +RKshareC: @ Round key common share C; see comment at init_key_4way for explanation +.space 4 +RKshareCchange: @ Temporary used by ref_roundkey_share_s +.space 4 + @ Regardless of configuration, the code uses a single 256-entry LUT, @ which is a 
simple S-box table. @ The LUT is represented as two shares, lut_a and lut_b, @@ -143,7 +176,22 @@ chaff: @ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁ .balign 16 lut_a: @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup) -.space 256 +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b .space 4 .space 4 @ align to 8 mod 16 @@ -152,38 +200,17 @@ lut_b: @ LUT share B (must be 8 mod 16 so that init_key_sb lut_b_map: .space 4 .space 4 @ align to multiple of 8 -rkey_s: @ round key shares: 600 bytes = 15 rounds * 
2 shares * (4+1) words - @ every fourth word has a word that is used as a vperm count, and also as a spacer to misalign the shares mod 16 -.space 600 -rkey4way: @ scratch area for init_key; could overlap this with other scratch space if need to save space -.space 128 -.if CT_BPERM -bperm_rand: @ 32 half words that define the oblivious permutation of blocks -.space 64 -.endif + .balign 16 +rstate_all_start: @ Mark start of RNG data to allow selective memory wipe rstate_sha: @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero .space 16 -rstate_lfsr: @ 32-bit LFSR random state and constant used to step it (initialised by C program) -.space 8 -.balign 16 -permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) -perm16: -.space 16 -@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s -.balign 16 -fourway: @ Must be 0 mod 16 -shareA: @ 0 mod 16 -.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 -shareB: @ 4 mod 16 -.space 20 -shareC: @ 8 mod 16 +jstate: @ 32-bit jitter state .space 4 -statevperm: @ 12 mod 16 -.space 4 @ vperm state rotation: only last two bits are operational; other bits random -RKshareC: +rstate_lfsr: @ 32-bit LFSR random state and constant used to step it .space 4 -.balign 16 +.word 0x1d872b41 @ constant that defines a maximal-length LFSR +rstate_all_end: @ Mark end of RNG data to allow selective memory wipe .if CT_BPERM .balign 16 @@ -195,7 +222,88 @@ murmur3_constants: @ Five constants used in murmur3_32 hash .word 0xc2b2ae35 .endif -@ Put main code in first scratch area +scratch_y_end: + +@ Initialisation code in main .text section +.section .text,"ax",%progbits + +@ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments. +@ We feed a stream of bits from the TRNG into the SHA hardware accelerator to generate some +@ random numbers. 
+@ Trashes r0-r6 +.balign 4 +init_rstate: + CHK_COUNT 24,6 + ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET + ldr r5,=SHA256_BASE + movs r1,#1 + str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] + ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] @ reads as 0 + movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS @ initialise SHA internal state by writing START bit + str r1,[r5,#SHA256_CSR_OFFSET] + str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET -TRNG_RNG_IMR_OFFSET] + movs r6,#TRNG_BLOCKS*2+1 @ odd so that we break out of the loop half-way through loading the SHA hardware, giving + @ time for previous SHA computation to complete +2: + movs r1,#0xff @ TRNG setup is inside loop in case it is skipped. + str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET] @ disable checks and bypass decorrelators, to stream raw TRNG ROSC samples + str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @ start ROSC if it is not already started + str r1,[r4,#TRNG_RNG_ICR_OFFSET -TRNG_RNG_IMR_OFFSET] @ clear all interrupts (including EHR_VLD) + adds r0,r4,#TRNG_EHR_DATA0_OFFSET -TRNG_RNG_IMR_OFFSET + movs r2,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET +1: + ldr r1,[r4,r2] @ wait for 192 ROSC samples to fill EHR, should take constant time + cmp r1,#0 + bne 1b + subs r6,#1 @ done? 
+ beq 3f + movs r1,#8 +1: + ldmia r0!,{r2} @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1) + str r2,[r5,#SHA256_WDATA_OFFSET] @ for a total of half a SHA-256 block + subs r1,#1 + bne 1b + ldr r2,[r5,#SHA256_SUM0_OFFSET] @ TRNG is now sampling again; use some SHA bits to modulate the chain length + str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] + b.n 2b + +3: + CHK_COUNT 25,6 + str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0 + str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] + adds r5,r5,#SHA256_SUM0_OFFSET + ldmia r5!,{r0-r3} + ldr r5,=rstate_sha + stmia r5,{r0-r3} + CHK_COUNT 26,6 + +@ r5=rstate_sha + movs r0,#0 + strb r0,[r5] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" +@ try to find a non-zero initialiser to create a non-degenerate LFSR + ldr r1,[r5,#4] + cbnz r1,1f @ is word 1 non-zero? then use it + ldr r1,[r5,#8] + cbnz r1,1f @ otherwise, is word 2 non-zero? use it + ldr r1,[r5,#12] + cbnz r1,1f @ otherwise, is word 3 non-zero? 
use it + mov r1,r5 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-96} probability) +1: + str r1,[r5,#rstate_lfsr-rstate_sha] + ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE + str r1,[r2,#0] + CHK_COUNT 27,6 +.if GEN_RAND_SHA +.if SH_JITTER + movs r2,#0 + str r2,[r5,#jstate-rstate_sha] +.endif +.endif + + CHK_COUNT 28,6 + bx r14 + +@ Put AES core code in first scratch area .section .scratch_x.aes,"ax",%progbits .if GEN_RAND_SHA @@ -209,11 +317,26 @@ murmur3_constants: @ Five constants used in murmur3_32 hash .balign 4 gen_rand_sha: push {r14} - GET_CANARY_NJ r14,CTAG1 + GET_CANARY r14,CTAG1,2 push {r1-r3,r14} +.if SH_JITTER + ldr r2,=rstate_sha + ldr r0,[r2,#jstate-rstate_sha] + movs r1,#1 + movs r3,r0,lsl#2 + ands r3,r3,#31 + movs r3,r1,lsl r3 @ 1<<(4*(r0&7)) + udiv r3,r3,r1 @ Takes constant + (r0&7) cycles + lsrs r0,r0,#1 + bne 1f + bl gen_rand_sha_nonpres + ldr r2,=rstate_sha +1: + str r0,[r2,#jstate-rstate_sha] +.endif bl gen_rand_sha_nonpres pop {r1-r3,r14} - CHK_CANARY_NJ r14,CTAG1 + CHK_CANARY r14,CTAG1,0 pop {r15} @ Return single random word in r0 @@ -273,11 +396,11 @@ gen_rand_sha_nonpres: gen_rand_sha: gen_rand_lfsr: @ Not used push {r14} - GET_CANARY_NJ r14,CTAG2 + GET_CANARY r14,CTAG2,2 push {r1,r2,r14} bl gen_rand_lfsr_nonpres pop {r1,r2,r14} - CHK_CANARY_NJ r14,CTAG2 + CHK_CANARY r14,CTAG2,0 pop {r15} .endif @@ -311,6 +434,56 @@ gen_rand_lfsr_nonpres: .ltorg +.balign 4 +.thumb_func +decrypt: + push {r14} + GET_CANARY r14,CTAG3,6 + SET_COUNT 23,6 + push {r0-r12,r14} + bl reset_sha_trng + bl init_rstate +@ randomly re-share the LUT contents + ldr r4,=lut_a + mov r5,#64 @ 64 words = 256 bytes +1: + bl gen_rand_sha_nonpres + ldr r6,[r4,#lut_b-lut_a] @ EOR a random word into both shares + eors r6,r6,r0 + str r6,[r4,#lut_b-lut_a] + ldr r6,[r4] + eors r6,r6,r0 + stmia r4!,{r6} + subs r5,r5,#1 + bne 1b + CHK_COUNT 29,6 + bl remap @ scramble the LUTs + pop {r0} @ pointer to 4way key data + CHK_COUNT 30,6 + bl init_key_4way 
+ CHK_COUNT 31,6 + bl lock_key + pop {r0-r2} + bl ctr_crypt_s + bl randomisechaff + clear03 + pop {r4-r12,r14} + CHK_CANARY r14,CTAG3,6 + pop {r15} + +.balign 4 +.thumb_func +reset_sha_trng: + ldr r1,=RESETS_BASE+RESETS_RESET_OFFSET + ldr r2,[r1] + ldr r3,=#RESETS_RESET_SHA256_BITS|RESETS_RESET_TRNG_BITS + orrs r2,r2,r3 + str r2,[r1] @ reset the SHA hardware and the TRNG hardware + CHK_COUNT 23,6 + bics r2,r2,r3 + str r2,[r1] @ release the reset + bx r14 + .balign 4 .thumb_func makesmallperm: @@ -321,7 +494,7 @@ makesmallperm: @ Trashes r0-r3 push {r14} - GET_CANARY_NJ r14,CTAG4 + GET_CANARY r14,CTAG4,6 push {r4-r6,r14} movs r4,r1 movs r6,r0 @@ -354,7 +527,7 @@ makesmallperm: 2: pop {r4-r6,r14} - CHK_CANARY_NJ r14,CTAG4 + CHK_CANARY r14,CTAG4,6 pop {r15} .balign 4 @@ -365,7 +538,7 @@ makeperm16: @ More efficient than calling makeperm with R0=16, R1=perm16 - fewer calls to gen_rand_sha @ Trashes r0-r5 - GET_CANARY r0,CTAG5 + GET_CANARY r0,CTAG5,1 push {r0,r14} ldr r4,=perm16 bl gen_rand_sha_nonpres @@ -421,7 +594,7 @@ makeperm16: bne 1b pop {r0,r14} - CHK_CANARY r0,CTAG5 + CHK_CANARY r0,CTAG5,4 bx r14 .balign 4 @@ -429,7 +602,7 @@ makeperm16: remap: @ do a random remap of the LUTs @ preserves r0-r11; trashes r12 - GET_CANARY r12,CTAG6 + GET_CANARY r12,CTAG6,6 push {r0-r12,r14} bl gen_rand_sha_nonpres ldr r1,=lut_a @@ -438,15 +611,14 @@ remap: ldr r1,=lut_b bl remap_1 pop {r0-r12,r14} - CHK_CANARY r12,CTAG6 + CHK_CANARY r12,CTAG6,6 bx r14 - remap_1: @ r0: B0:xa B1:xb B2:ya B3:yb @ r1: array of 256 bytes, followed by a 4-byte map @ shuffle LUT share array such that new[i]=old[i^xa^xb]^ya^yb, update map according to r0 - GET_CANARY_NJ r6,CTAG7 + GET_CANARY r6,CTAG7,6 push {r6,r14} mov r14,0x01010101 ubfx r6,r0,#16,#8 @@ -491,7 +663,7 @@ remap_1: subs r2,r2,#4 bpl 1b pop {r6,r14} - CHK_CANARY_NJ r6,CTAG7 + CHK_CANARY r6,CTAG7,6 bx r14 .if RK_ROR @@ -511,7 +683,7 @@ ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rou ldr r4,=rkey_s loadlfsr 
steplfsr @ r0=change in RKshareC - adr r2,RKshareCchange + ldr r2,=RKshareCchange str r0,[r2] ldr r3,=RKshareC ldr r5,[r3] @@ -535,7 +707,8 @@ ref_roundkey_shares_s_loop: steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] - ldr r3,RKshareCchange + ldr r3,=RKshareCchange + ldr r3,[r3] movs r2,#0 usub8 r10,r2,r10 ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2 @@ -554,9 +727,6 @@ ref_roundkey_shares_s_loop: clear03 24 ref_roundkey_shares_s_exit: bx r14 - .balign 4 -RKshareCchange: - .space 4 .balign 4 .thumb_func @@ -570,7 +740,7 @@ RKshareCchange: ref_roundkey_hvperms_s: movs r7,#30 ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r10,CTAG9 + GET_CANARY r10,CTAG9,6 push {r10,r14} ldr r10,=rkey_s ref_roundkey_hvperms_s_loop: @@ -592,7 +762,7 @@ ref_roundkey_hvperms_s_loop: clear03 28 ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code pop {r10,r14} - CHK_CANARY r10,CTAG9 + CHK_CANARY r10,CTAG9,6 bx r14 .else @@ -604,7 +774,7 @@ ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to ana ref_roundkey_shares_s: mov r11,#15 @ there are 15 expanded keys ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - GET_CANARY r4,CTAG8 + GET_CANARY r4,CTAG8,6 push {r4,r14} ldr r4,=rkey_s loadlfsr @@ -641,7 +811,7 @@ ref_roundkey_shares_s_loop: clear03 24 ref_roundkey_shares_s_exit: pop {r4,r14} - CHK_CANARY r4,CTAG8 + CHK_CANARY r4,CTAG8,6 bx r14 .balign 4 @@ -651,7 +821,7 @@ ref_roundkey_shares_s_exit: ref_roundkey_hvperms_s: movs r7,#30 ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r0,CTAG9 + GET_CANARY r0,CTAG9,6 push {r0,r14} bl gen_rand_lfsr_nonpres ldr r1,=rkey_s 
@@ -679,11 +849,13 @@ ref_roundkey_hvperms_s_loop: clear03 28 ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code pop {r0,r14} - CHK_CANARY r0,CTAG9 + CHK_CANARY r0,CTAG9,6 bx r14 .endif +.ltorg + .if ST_VPERM .balign 4 .thumb_func @@ -733,7 +905,7 @@ addstatevperm_exit: @ label exit point to be to able to specify to ana @ Trashes r0-r3,r12 .balign 4 ns_to_s: - GET_CANARY r12,CTAG11 + GET_CANARY r12,CTAG11,6 push {r12,r14} .if ST_SHAREC bl gen_rand_sha_nonpres @ Create state share C; all bytes the same @@ -765,7 +937,7 @@ ns_to_s: bl addstatevperm @ Initialise state vperm with SHA RNG, refresh with LFSR RNG .endif pop {r12,r14} - CHK_CANARY r12,CTAG11 + CHK_CANARY r12,CTAG11,6 bx r14 @ Conjugate lut_a, lut_b with shareC @@ -863,8 +1035,6 @@ shift_rows_s: @ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1 .macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b -@ !!! can probably save some registers, e.g. 
allow trashing of r0x00, r0x1b -@ can possibly also simplify slightly with refactorisation uadd8 \rt,\rx,\rx @ field multiplication by 2 as above sel \rw,\r0x1b,\r0x00 eors \rt,\rt,\rw @ 2x @@ -904,51 +1074,6 @@ mix_cols_s: ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers bx r14 -.balign 4 -.thumb_func -gen_lut_sbox: -@ gen_lut_sbox sets both lut_a and lut_b to the S-box table and -@ returns r0=lut_a+256, r1=lut_b+256 -@ first set lut_a to be a table of GF(2⁸) inverses, using lut_b as temporary storage - ldr r0,=lut_a - ldr r1,=lut_b -@ first set lut_a to be a table of antilogarithms, lut_b a table of logarithms - mov r2,#0 - strb r2,[r0] @ (*) - mov r3,#1 @ we maintain invariant that r2=log(r3) -1: - strb r2,[r0,r3] @ log table - strb r3,[r1,r2] @ antilog table - lsls r12,r3,#25 - it cs - eorcs r12,r12,#0x1b000000 @ multiply by x - eor r3,r3,r12,lsr#24 @ multiply by x+1 ("3"), which is a primitive element - add r2,r2,#1 - cmp r2,#255 - bls 1b - movs r2,#255 -1: - ldrb r3,[r0,r2] @ for each i≠0, find log,... - eor r3,r3,#255 @ ... negate... - ldrb r3,[r1,r3] @ ... 
and antilog to get inverse - strb r3,[r0,r2] - subs r2,r2,#1 - bne 1b @ note that inverse(0)=0 by (*) above -@ At this point r0=lut_a, r1=lut_b, lut_a[] contains inverses and lut_b[] contains other stuff - mov r12,#256 -1: - ldrb r2,[r0] - eors r3,r2,r2,lsl#1 @ convolve byte with 0x1f - eors r3,r3,r3,lsl#2 - eors r3,r3,r2,lsl#4 - eors r2,r3,r3,lsr#8 - eor r2,r2,#0x63 @ and add 0x63 - strb r2,[r0],#1 @ let lut_a[i]=sbox[i] - strb r2,[r1],#1 @ let lut_b[i]=sbox[i] - subs r12,r12,#1 - bne 1b - bx r14 - @ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups) .macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3 ubfx \Rspare0,\Rtarg,#0, #8 @@ -970,79 +1095,106 @@ gen_lut_sbox: .balign 4 .thumb_func map_sbox_s: - GET_CANARY r12,CTAG12 + GET_CANARY r12,CTAG12,3 push {r12,r14} ldr r0,=shareA @ Write out state share A to memory - stmia r0,{r4-r7} - clear03 @ barrier +@ stmia r0,{r4-r7} @ Used to do a STM + getchaffaddress r1 + ldr r2,[r1] + str r4,[r0] @ Intersperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms, + str r2,[r1] @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired + str r5,[r0,#4] @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic. + str r2,[r1] @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1. + str r6,[r0,#8] @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but + str r2,[r1] @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic. + str r7,[r0,#12] + str r2,[r1] ldr r0,=shareB @ Write out state share B to memory - stmia r0,{r8-r11} - clear03 4 @ barrier + stmia r0,{r8-r11} @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with bl makeperm16 @ Rebuild random 16-way permutation. 
Maybe do this less frequently @ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation + bl gen_rand_sha_nonpres + mov r11,r0 ldr r8,=lut_a ldr r9,=lut_b ldr r0,[r8,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) - eors r10,r0,r0,lsr#8 - uxtb r10,r10 @ R10 = a0^a1 + eors r3,r0,r0,lsr#8 @ R3 = a0^a1 | junk + uxtb r10,r3 ldr r1,[r9,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) eors r1,r0,r1 eors r2,r1,r1,lsr#8 - uxtb r11,r2 @ R11 = a0^a1^b0^b1 movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8 + bfi r12,r2,#16,#8 @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 ldr r4,=perm16 ldr r5,=shareA ldr r6,=shareB -@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=a0^a1^b0^b1, r12=(c0^d0) | (c1^d1)<<8 + movs r1,#0;movs r2,#0;movs r3,#0 +@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 movs r0,#15 1: @ (Ordering instructions to minimise result delays) ldrb r1,[r4,r0] @ r1 = perm[r0] + mov r11,r11,ror#11 @ Rotate random 32 bits to present a new low 8 bits eors r7,r1,#2 @ r7 = perm[r0]^2 ldrb r2,[r5,r1] @ r2 = shareA[perm[r0]] + eor r11,r11,r2,ror#8 @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted) ldrb r3,[r6,r7] @ r3 = shareB[perm[r0]^2] - eors r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 + eor r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 eors r2,r2,r3 @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2] ldrb r3,[r8,r2] @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]] - eors r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) - eors r2,r2,r11 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] - strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 + eor r2,r2,r12,lsr#16 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] + eor 
r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) + eor r3,r3,r11 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8) + strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand ldrb r3,[r9,r2] @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]] subs r0,r0,#1 - eor r3,r3,r12,lsr#8 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1 - strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1 + eor r3,r3,r11 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand + eor r3,r3,r12,lsr#8 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8) + strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 bpl 1b clear03 8 @ barrier ldmia r6,{r8-r11} @ Read state share B back from memory clear03 12 @ barrier - ldmia r5,{r4-r7} @ Read state share A back from memory - clear03 16 @ barrier + getchaffaddress r0,16 + bfi r0,r5,#0,#4 @ match chaff pointer (r0) to share A location (R5) mod 16 + @ldmia r5,{r4-r7} @ Read state share A back from memory + @clear03 16 @ barrier + ldr r4,[r5] @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s + ldr r1,[r0] + ldr r6,[r5,#8] + ldr r1,[r0,#8] + ldr r7,[r5,#12] + ldr r1,[r0,#12] + ldr r5,[r5,#4] @ Do r5 last because it's the address register + ldr r1,[r0,#4] @ Refresh state shares because luts only give imperfect share-by-value - - loadlfsr - steplfsr; eors r4,r4,r0; mov r12,#0; eors r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc - steplfsr; eors r5,r5,r0; mov r12,#0; eors r9,r9,r0,ror#16 - steplfsr; eors r6,r6,r0; mov r12,#0; eors r10,r10,r0,ror#16 - steplfsr; eors r7,r7,r0; mov r12,#0; eors r11,r11,r0,ror#16 - savelfsr +@ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox 
operation (though the R11 bytes are not fully independent) +@ loadlfsr +@ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc +@ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16 +@ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16 +@ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16 +@ savelfsr pop {r12,r14} - CHK_CANARY r12,CTAG12 + CHK_CANARY r12,CTAG12,5 bx r14 +.ltorg + .balign 4 .thumb_func randomisechaff: @ Randomise 48 bytes of chaff values (random load values) @ Uses 12 bytes of permscratch @ Trashes r0-3 - GET_CANARY r0,CTAG13 + GET_CANARY r0,CTAG13,6 push {r0,r14} movs r0,#12 ldr r1,=permscratch @@ -1059,19 +1211,32 @@ randomisechaff: subs r1,r1,#1 bpl 1b pop {r0,r14} - CHK_CANARY r0,CTAG13 + CHK_CANARY r0,CTAG13,6 bx r14 .balign 4 -refreshchaff: +refreshchaff_and_lfsr: @ Update 48 bytes of chaff values (random load values) using faster RNG than used for randomisechaff +@ Re-randomise LFSR with SHA @ Uses 12 bytes of permscratch @ Trashes r0-3,12 - GET_CANARY r0,CTAG14 + GET_CANARY r0,CTAG14,6 push {r0,r14} + +@ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence + bl gen_rand_sha_nonpres + ldr r1,=rstate_lfsr + ldr r2,[r1] + adds r2,r2,r0 + beq 1f @ Don't update LFSR state to 0 + str r2,[r1] +1: + +@ Choose a random order to update chaff words to make 2nd order attacks harder movs r0,#12 ldr r1,=permscratch - bl makesmallperm @ Update the random words in a random order to make 2nd order attacks harder + bl makesmallperm + movs r1,#11 1: push {r1} @@ -1086,7 +1251,7 @@ refreshchaff: subs r1,r1,#1 bpl 1b pop {r0,r14} - CHK_CANARY r0,CTAG14 + CHK_CANARY r0,CTAG14,6 bx r14 .balign 4 @@ -1094,7 +1259,7 @@ refreshchaff: @ Do sbox on the four bytes of the 4-way share r4-r7 @ Trashes r0,r8-r12 init_key_sbox: - GET_CANARY r12,CTAG15 + GET_CANARY r12,CTAG15,6 push {r1-r3,r12,r14} bl gen_rand_sha_nonpres; mov r8,r0 bl gen_rand_sha_nonpres; mov 
r9,r0 @@ -1113,16 +1278,16 @@ init_key_sbox: movs r0,#4 ldr r1,=permscratch bl makesmallperm @ Build random 4-way permutation determining order of bytes to be SBOXed - ldr r1,=permscratch @ Write out random addresses in advance to save two registers + ldr r1,=permscratch @ Write out random addresses in advance to save two registers (reusing permscratch) ldr r4,[r1] ldr r0,=fourway uxtab r5,r0,r4 uxtab r6,r0,r4,ror#8 uxtab r7,r0,r4,ror#16 uxtab r8,r0,r4,ror#24 - stmia r1,{r5-r8} @ Store fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] + stmia r1,{r5-r8} @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] - bl gen_rand_sha @ Save some randomness for the resharing operation later + bl gen_rand_sha @ Save some randomness for the resharing operation later movs r7,r0 bl gen_rand_sha movs r8,r0 @@ -1148,8 +1313,8 @@ init_key_sbox: ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration) ldr r5,[r5] @ Random load to mask previous load - ands r9,r6,#12 @ r9 = chaff address aligned to r6 mod 16 - add r9,r11,r9 + ands r9,r6,#12 + add r9,r11,r9 @ r9 = chaff address aligned to (r6 bic 3) mod 16 ldrb r4,[r6,#0] ldr r14,[r9,#0] @ Random load to mask previous load eor r4,r4,r10 @@ -1171,7 +1336,7 @@ init_key_sbox: eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 ands r14,r4,#255 - ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] + ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] and r14,r4,#15 add r14,r14,#32 ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16) @@ -1207,7 +1372,7 @@ init_key_sbox: ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers pop {r1-r3,r12,r14} - CHK_CANARY r12,CTAG15 + CHK_CANARY r12,CTAG15,6 bx r14 .balign 4 @@ -1221,7 +1386,7 @@ init_key_sbox: @ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4]) @ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th 
byte of Bptr[4])+16 storeroundkey: - GET_CANARY r8,CTAG16 + GET_CANARY r8,CTAG16,6 push {r2,r8,r14} @ eor two 4-way share components to make a component of a 2-way share @@ -1291,12 +1456,12 @@ storeroundkey: adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40 pop {r2,r8,r14} - CHK_CANARY r8,CTAG16 + CHK_CANARY r8,CTAG16,6 bx r14 .balign 4 .thumb_func -init_key: +init_key_4way: @ On entry, r0 points to 4-way shared raw key data (128 bytes) @ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7 @ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K. @@ -1312,17 +1477,21 @@ init_key: @ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16 @ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC - GET_CANARY r12,CTAG17 - push {r4-r11,r12,r14} + GET_CANARY r12,CTAG17,6 + push {r0-r12,r14} +@ Transfer 4-way key into local workspace, rerandomising the shares mov r5,r0 @ r5=4-way key input bl randomisechaff - ldr r4,=rkey4way - movs r6,#8 + ldr r6,=rkey4way + movs r7,#8 1: - ldmia r5!,{r0-r3} - stmia r4!,{r0-r3} - subs r6,r6,#1 + ldmia r5!,{r1-r4} + bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0 + bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0 + bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0 + stmia r6!,{r1-r4} + subs r7,r7,#1 bne 1b @ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for @@ -1400,10 +1569,12 @@ init_key_expandloop: cmp r2,#52 bne init_key_expandloop - pop {r4-r11,r12,r14} - CHK_CANARY r12,CTAG17 + pop {r0-r12,r14} + CHK_CANARY r12,CTAG17,6 bx r14 +.ltorg + @ Add the round key shares pointed to by r12 into the state shares @ Trashes r0-r3 .balign 4 @@ -1421,7 +1592,7 @@ addrkey_s: ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits ldr r2,[r0,#16] @ barrier load - rsbs r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot + rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot @ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by 
-vpermstaterot @ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr .if RK_ROR @@ -1444,7 +1615,7 @@ addrkey_s: bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits ldr r2,[r0,#16] @ barrier load - rsbs r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot + rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot ldr r3,=RKshareC @ r3=common round key shareC bfi r0,r3,#0,#4 ldr r3,[r3] @@ -1466,7 +1637,6 @@ addrkey_s: ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; eors r11,r11,r0 .endif clear03 - bx r14 .balign 4 @@ -1484,11 +1654,11 @@ addrkey_s: ctr_crypt_s: @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks - GET_CANARY r12,CTAG0 - push {r0,r4-r11,r12,r14} + GET_CANARY r12,CTAG0,6 + push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets push {r0-r2} - SET_COUNT 93 + SET_COUNT 93,6 .if CT_BPERM @ Initialise 32 random numbers (which fit in half-words) @@ -1505,10 +1675,10 @@ ctr_crypt_s: bl randomisechaff pop {r0-r2} movs r3,#0 - CHK_COUNT 93 + CHK_COUNT 93,6 ctr_crypt_mainloop: - SET_COUNT 80 + SET_COUNT 80,6 @ here r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) @@ -1517,7 +1687,7 @@ ctr_crypt_mainloop: tst r3,#(REFCHAFF_PERIOD-1) bne 1f - bl refreshchaff + bl refreshchaff_and_lfsr 1: ldr r3,[r13,#12] @ get block count off the stack @@ -1525,7 +1695,7 @@ ctr_crypt_mainloop: bne 1f bl remap @ shuffle the LUTs; this preserves R3 1: - CHK_COUNT 80 + CHK_COUNT 80,6 tst r3,#(REFROUNDKEYSHARES_PERIOD-1) bne 1f @@ -1538,7 +1708,8 @@ ctr_crypt_mainloop: bl ref_roundkey_hvperms_s @ refresh the round key vperms 1: - CHK_COUNT 81 + CHK_COUNT 81,6 + pop {r0-r3} @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @@ -1586,9 +1757,9 @@ ctr_crypt_mainloop: .else mov 
r12,r3 .endif - CHK_COUNT 82 + CHK_COUNT 82,6 -@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered +@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered) push {r0-r3,r12} processIV: @ non-target label to assist power analysis @@ -1605,36 +1776,36 @@ processIV: @ non-target label to assist power analysis ldr r0,[r13] @ peek at stack to restore r0=IV ptr ldmia r0,{r4-r7} @ load IV clear03 @ barrier to remove traces of IV from internal CPU load registers - push {r0-r3} @ We want to randomise the internal memory registers associated with the above LDM load, but this - pop {r0-r3} @ may come from non-scratch memory and have its own internal registers, so we clear it using a - @ stack save/load. Either R13 is in non-scratch memory, in which case this works, or it isn't, in - @ which case it doesn't matter, because the only subsequent use of non-scratch memory is the stack. @ Add in r9 in byte-big-endian, bit-little-endian (!) fashion, while trying to avoid rev operations @ as far as possible as these tend to expose (via power fluctuations) byte-level hamming weights. -@ It's worth avoiding revs on r6, r5, r4, even at the cost of introducing a small timing dependency. 
- @ First do 128-bit addition of r9 to byte-reversed IV - rev r7,r7; adds r7,r7,r9; bcc 1f - rev r6,r6; adcs r6,r6,#0; rev r6,r6; bcc 1f - rev r5,r5; adcs r5,r5,#0; rev r5,r5; bcc 1f - rev r4,r4; adcs r4,r4,#0; rev r4,r4 + rev r7,r7 + cmn r7,#MAX_NUM_BLOCKS @ Compare against maximum number of blocks + bcs 1f + add r7,r7,r9 @ This can temporarily overflow but it doesn't matter as we know that r7+r12 does not overflow + sub r7,r7,r8 + b 2f 1: -@ At this point, r7 is reversed and r4-r6 are not + adds r7,r7,r9 + rev r6,r6; adcs r6,r6,#0 + rev r5,r5; adcs r5,r5,#0 + rev r4,r4; adcs r4,r4,#0 @ Now do 128-bit subtraction of r8 from byte-reversed IV - subs r7,r7,r8; rev r7,r7; bcs 1f - rev r6,r6; sbcs r6,r6,#0; rev r6,r6; bcs 1f - rev r5,r5; sbcs r5,r5,#0; rev r5,r5; bcs 1f - rev r4,r4; sbcs r4,r4,#0; rev r4,r4 -1: + subs r7,r7,r8 + sbcs r6,r6,#0; rev r6,r6 + sbcs r5,r5,#0; rev r5,r5 + sbcs r4,r4,#0; rev r4,r4 +2: + rev r7,r7 clear01 16 - CHK_COUNT 83 + CHK_COUNT 83,6 @ r4-r7 = IV for the current block bl ns_to_s @ convert IV+x to shares, which includes choosing and incorporating a random shareC - CHK_COUNT 84 + CHK_COUNT 84,6 bl conjshareC @ Add the effect of shareC to lut_a, lut_b - CHK_COUNT 85 + CHK_COUNT 85,6 @ now perform the 15 encryption rounds on (key, state=IV+x) @ here r4-r7, r8-r11: state mov r2,#0 @ round counter @@ -1647,7 +1818,7 @@ rounds_s_mainloop: bl map_sbox_s bl shift_rows_s .if ST_VPERM - ldmia r13,{r2} @ peek at stack to get round count + ldr r2,[r13] @ peek at stack to get round count cmp r2,#NUMREFSTATEVPERM bcs 1f bl gen_rand_lfsr_nonpres @@ -1664,12 +1835,12 @@ rounds_s_mainloop: pop {r2} b rounds_s_mainloop 2: - CHK_COUNT 86 + CHK_COUNT 86,6 ldr r12,=rkey_s+14*40 @ final round key shares bl addrkey_s - CHK_COUNT 87 + CHK_COUNT 87,6 bl conjshareC @ Undo the effect of shareC from lut_a, lut_b - CHK_COUNT 88 + CHK_COUNT 88,6 .if ST_VPERM @ Undo the effects of vperm rotation recorded in statevperm ldr r1,=statevperm @@ -1682,6 +1853,7 @@ 
rounds_s_mainloop: push {r0,r3} @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered +decryption_start: @ Decrypt ciphertext using AES output in shares: r4-r11 .if ST_SHAREC ldr r0,=shareC @@ -1689,38 +1861,79 @@ rounds_s_mainloop: .else movs r0,#0 .endif - CHK_COUNT 89 + ldr r14,=chaff +@ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff + CHK_COUNT 89,6 add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered - ldr r3,[r1] - eors r3,r3,r4 - eors r3,r3,r8,ror#16 @ Now r4 and r8 are free - eors r3,r3,r0 - str r3,[r1] - ldr r3,[r1,#4] + ldr r3,[r1] @ r3=ciphertext word + eors r3,r3,r4 @ r3=r3^shareA + ldr r4,[r14] @ barrier load + eor r3,r3,r8,ror#16 @ r3=r3^shareB + eors r3,r3,r0 @ r3=r3^shareC + str r3,[r1] @ plaintext word=r3 + ldr r3,[r1,#4] @ and similarly for words 1,2,3 of block... + ldr r4,[r14,#4] eors r3,r3,r5 - eors r3,r3,r9,ror#16 + eor r3,r3,r9,ror#16 eors r3,r3,r0 str r3,[r1,#4] ldr r3,[r1,#8] + ldr r4,[r14,#8] eors r3,r3,r6 - eors r3,r3,r10,ror#16 + eor r3,r3,r10,ror#16 eors r3,r3,r0 str r3,[r1,#8] ldr r3,[r1,#12] + ldr r4,[r14,#12] eors r3,r3,r7 - eors r3,r3,r11,ror#16 + eor r3,r3,r11,ror#16 eors r3,r3,r0 str r3,[r1,#12] + sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer - CHK_COUNT 90 + CHK_COUNT 90,6 pop {r0,r3} @ Restore IV and block counter @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter +decryption_end: adds r3,r3,#1 cmp r3,r2 - CHK_COUNT 91 + CHK_COUNT 91,6 bne ctr_crypt_mainloop - pop {r0,r4-r11,r12,r14} - CHK_CANARY r12,CTAG0 + +#if WIPE_MEMORY +@ Wipe memory from workspace_start up to the stack pointer +@ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals + ldr r4,=workspace_start + ldr r5,=rstate_all_start +1: + bl gen_rand_sha_nonpres + stmia r4!,{r0} + cmp r4,r5 + bcc 1b + ldr r4,=rstate_all_end + mov 
r5,r13 @ gcc arm assembler says cmp r4,r13 is deprecated, so use another register +1: + bl gen_rand_sha_nonpres + stmia r4!,{r0} + cmp r4,r5 + bcc 1b + +@ Then fill everything with zeros so as not to leave behind clues about the RNG state + ldr r4,=workspace_start + movs r0,#0 + mov r5,r13 +1: + stmia r4!,{r0} + cmp r4,r5 + bcc 1b +#endif + +.if GEN_RAND_SHA + SET_COUNT 23,6 + bl reset_sha_trng @ clear out the SHA hardware +.endif + pop {r0-r12,r14} + CHK_CANARY r12,CTAG0,6 bx r14 diff --git a/bootloaders/encrypted/config.h b/bootloaders/encrypted/config.h index dd0c9898e..2c4ce0d03 100644 --- a/bootloaders/encrypted/config.h +++ b/bootloaders/encrypted/config.h @@ -1,6 +1,6 @@ #pragma once -// These options should be enabled because the security risk of not using them is too high +// These options (up to long /////////////// line) should be enabled because the security risk of not using them is too high // or because the time cost is very low so you may as well have them. // They can be set to 0 for analysis or testing purposes. @@ -22,6 +22,10 @@ #define RK_ROR 1 // store round key shares with random rotations within each word #endif +#ifndef WIPE_MEMORY +#define WIPE_MEMORY 1 // Wipe memory after decryption +#endif + // The following options should be enabled to increase resistance to glitching attacks. #ifndef RC_CANARY @@ -31,34 +35,42 @@ #define RC_COUNT 1 // use rcp_count feature #endif -// Although enabling the following option likely has little theoretical benefit, in -// practice randomising the timing of operations can make side-channel attacks very -// much more effort to carry out. It can be disabled for analysis or testing purposes. +// Although jitter/timing-variation may be circumventable in theory, in practice +// randomising the timing of operations can make side-channel attacks very much more +// effort to carry out. These can be disabled for analysis or testing purposes. +// It is advisable to use at least one form of jitter.
+// RC_JITTER is quite slow, and is probably the most predictable of the three, so it is disabled by default. +// (Leaving it as an option because it's just possible that the large delays it produces are advantageous in defeating certain side-channel attacks.) #ifndef RC_JITTER -#define RC_JITTER 1 // use random-delay versions of RCP instructions +#define RC_JITTER 0 // 0-7. Higher = more jitter. Governs use of random-delay versions of RCP instructions. #endif +#ifndef SH_JITTER +#define SH_JITTER 1 // Insert random delays, tagged onto SHA RNG +#endif + + //////////////////////////////////////////////////////////////////////////////////////////////////////////// // The following options can be adjusted, affecting the performance/security tradeoff // Period = X means that the operation in question occurs every X blocks, so higher = more performance and lower security. // No point in making them more than 16 or so, since the time taken by the subroutines would be negligible. -// These must be a power of 2. Timings as of commit 24277d13 -// RK_ROR=0 RK_ROR=1 -// Baseline time per 16-byte block = { 14066 14336 } cycles +// These must be a power of 2. 
Timings as of commit 82d31652 +// +// Baseline time per 16-byte block = 14109 cycles (with no jitter) #ifndef REFCHAFF_PERIOD -#define REFCHAFF_PERIOD 1 // Extra cost per 16-byte block = { 462 462 }/REFCHAFF_PERIOD cycles +#define REFCHAFF_PERIOD 1 // Extra cost per 16-byte block = 474/REFCHAFF_PERIOD cycles #endif #ifndef REMAP_PERIOD -#define REMAP_PERIOD 4 // Extra cost per 16-byte block = { 4131 4131 }/REMAP_PERIOD cycles +#define REMAP_PERIOD 4 // Extra cost per 16-byte block = 4148/REMAP_PERIOD cycles #endif #ifndef REFROUNDKEYSHARES_PERIOD -#define REFROUNDKEYSHARES_PERIOD 1 // Extra cost per 16-byte block = { 1107 1212 }/REFROUNDKEYSHARES_PERIOD cycles +#define REFROUNDKEYSHARES_PERIOD 1 // Extra cost per 16-byte block = 1304/REFROUNDKEYSHARES_PERIOD cycles #endif #ifndef REFROUNDKEYHVPERMS_PERIOD -#define REFROUNDKEYHVPERMS_PERIOD 1 // Extra cost per 16-byte block = { 936 1422 }/REFROUnDKEYVPERM_PERIOD cycles +#define REFROUNDKEYHVPERMS_PERIOD 1 // Extra cost per 16-byte block = 1486/REFROUNDKEYHVPERMS_PERIOD cycles #endif // Setting NUMREFSTATEVPERM to X means that state vperm refreshing happens on the first X AES rounds only, @@ -66,5 +78,13 @@ // The rationale for doing it this way is that later rounds should be protected by CT_BPERM. // NUMREFSTATEVPERM can be from 0 to 14.
#ifndef NUMREFSTATEVPERM -#define NUMREFSTATEVPERM 7 // Extra cost per 16-byte block = 80*NUMREFSTATEVPERM cycles +#define NUMREFSTATEVPERM 7 // Extra cost per 16-byte block = 61*NUMREFSTATEVPERM cycles +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define MAX_NUM_BLOCKS 32768 + +#if SH_JITTER && !GEN_RAND_SHA +#error GEN_RAND_SHA must be set if you want to use SH_JITTER #endif diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c index 79fb8fb10..814ce70cc 100644 --- a/bootloaders/encrypted/enc_bootloader.c +++ b/bootloaders/encrypted/enc_bootloader.c @@ -16,51 +16,17 @@ #include "config.h" -volatile uint32_t systick_data[18]; // count, R0-R15,RETPSR - -extern void remap(); -extern uint32_t gen_rand_sha(); -extern void init_key(uint8_t *key); -extern void gen_lut_sbox(); -extern int ctr_crypt_s(uint8_t*iv,uint8_t*buf,int nblk); - -extern uint8_t rkey_s[480]; -extern uint8_t lut_a[256]; -extern uint8_t lut_b[256]; -extern uint32_t lut_a_map[1]; -extern uint32_t lut_b_map[1]; -extern uint32_t rstate_sha[4],rstate_lfsr[2]; - -void resetrng() { - uint32_t f0,f1; - do f0=get_rand_32(); while(f0==0); // make sure we don't initialise the LFSR to zero - f1=get_rand_32(); - rstate_sha[0]=f0&0xffffff00; // bottom byte must be zero (or 4) for SHA, representing "out of data" - rstate_sha[1]=f1; - rstate_sha[2]=0x41414141; - rstate_sha[3]=0x41414141; - rstate_lfsr[0]=f0; // must be nonzero for non-degenerate LFSR - rstate_lfsr[1]=0x1d872b41; // constant that defines LFSR -#if GEN_RAND_SHA - reset_block(RESETS_RESET_SHA256_BITS); - unreset_block(RESETS_RESET_SHA256_BITS); -#endif -} +#define OTP_KEY_PAGE 30 -static void init_lut_map() { - int i; - for(i=0;i<256;i++) lut_b[i]=gen_rand_sha()&0xff, lut_a[i]^=lut_b[i]; - lut_a_map[0]=0; - lut_b_map[0]=0; - remap(); -} +extern void decrypt(uint8_t* key4way, uint8_t* iv, uint8_t(*buf)[16], int nblk); -static void 
init_aes() { - resetrng(); - gen_lut_sbox(); - init_lut_map(); +// The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins. +// That is a suitable point to lock the OTP area where key information is stored. +void lock_key() { + otp_hw->sw_lock[OTP_KEY_PAGE] = 0xf; } + static __attribute__((aligned(4))) uint8_t workarea[4 * 1024]; int main() { @@ -182,13 +148,10 @@ int main() { for (int i=0; i < 4; i++) printf("%08x\n", *(uint32_t*)(SRAM_BASE + i*4)); - init_aes(); // Read key directly from OTP - guarded reads will throw a bus fault if there are any errors uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE; - init_key((uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x780)])); - otp_hw->sw_lock[30] = 0xf; - ctr_crypt_s(iv, (void*)SRAM_BASE, data_size/16); + decrypt((uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & (OTP_KEY_PAGE * 0x40))]), iv, (void*)SRAM_BASE, data_size/16); printf("Post decryption image begins with\n"); for (int i=0; i < 4; i++)