From f813af63d81b3a104bc0d73f587603e698f3f9e7 Mon Sep 17 00:00:00 2001
From: William Vinnicombe <william.vinnicombe@raspberrypi.com>
Date: Mon, 24 Feb 2025 17:34:56 +0000
Subject: [PATCH] Update enc_bootloader with latest aes.S (picotool 333d571c)

CK_JITTER is removed as the enc_bootloader runs from XOSC not ROSC
---
 bootloaders/encrypted/aes.S            | 681 ++++++++++++++++---------
 bootloaders/encrypted/config.h         |  46 +-
 bootloaders/encrypted/enc_bootloader.c |  53 +-
 3 files changed, 488 insertions(+), 292 deletions(-)

diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S
index ad6c448d8..e0d653237 100644
--- a/bootloaders/encrypted/aes.S
+++ b/bootloaders/encrypted/aes.S
@@ -1,3 +1,13 @@
+/* MEMORY LAYOUT ASSUMPTIONS
+
+The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see
+the macro getchaffaddress.
+
+The stack must be located at the end of Y scratch RAM: see the memory
+wiping at the end of ctr_crypt_s where memory between the start of Y
+scratch RAM and the stack pointer is overwritten.
+*/
+
 .syntax unified
 .cpu cortex-m33
 .thumb
@@ -5,26 +15,24 @@
 #include "config.h"
 #include "hardware/platform_defs.h"
 #include "hardware/regs/addressmap.h"
+#include "hardware/regs/clocks.h"
 #include "hardware/regs/sha256.h"
+#include "hardware/regs/resets.h"
+#include "hardware/regs/rosc.h"
+#include "hardware/regs/trng.h"
 #include "hardware/rcp.h"
 
-.global gen_lut_sbox
-.global ctr_crypt_s
-.global remap
-.global gen_rand_sha
-.global init_key
+.global decrypt
+.global chaff
 
-.global rkey_s
-.global lut_a,lut_a_map
-.global lut_b,lut_b_map
-.global rstate_sha,rstate_lfsr
+.extern lock_key
 
 @ RCP macros
 
 #define CTAG0  0x2a
 #define CTAG1  0x2b
 #define CTAG2  0x2c
-#define CTAG3  0x2d @ not used
+#define CTAG3  0x2d
 #define CTAG4  0x2e
 #define CTAG5  0x30
 #define CTAG6  0x31
@@ -41,9 +49,13 @@
 #define CTAG17 0x3c
 #define CTAG18 0x3d @ not used
 
-.macro SET_COUNT n
+@ number of blocks from the TRNG processed to initialise rstate_sha
+#define TRNG_BLOCKS 25
+
+@ The lower the jitterpriority, the more jitter is applied (jitter is used when RC_JITTER > jitterpriority)
+.macro SET_COUNT n,jitterpriority
 .if RC_COUNT
-.if RC_JITTER
+.if RC_JITTER > \jitterpriority
  rcp_count_set \n
 .else
  rcp_count_set_nodelay \n
@@ -51,9 +63,9 @@
 .endif
 .endm
 
-.macro CHK_COUNT n
+.macro CHK_COUNT n,jitterpriority
 .if RC_COUNT
-.if RC_JITTER
+.if RC_JITTER > \jitterpriority
  rcp_count_check \n
 .else
  rcp_count_check_nodelay \n
@@ -61,9 +73,9 @@
 .endif
 .endm
 
-.macro GET_CANARY rx,tag
+.macro GET_CANARY rx,tag,jitterpriority
 .if RC_CANARY
-.if RC_JITTER
+.if RC_JITTER > \jitterpriority
  rcp_canary_get \rx,\tag
 .else
  rcp_canary_get_nodelay \rx,\tag
@@ -71,9 +83,9 @@
 .endif
 .endm
 
-.macro CHK_CANARY rx,tag
+.macro CHK_CANARY rx,tag,jitterpriority
 .if RC_CANARY
-.if RC_JITTER
+.if RC_JITTER > \jitterpriority
  rcp_canary_check \rx,\tag
 .else
  rcp_canary_check_nodelay \rx,\tag
@@ -81,18 +93,6 @@
 .endif
 .endm
 
-.macro GET_CANARY_NJ rx,tag  @ with no jitter even if you ask for it (for situations where it would otherwise slow things down a lot)
-.if RC_CANARY
- rcp_canary_get_nodelay \rx,\tag
-.endif
-.endm
-
-.macro CHK_CANARY_NJ rx,tag  @ with no jitter even if you ask for it
-.if RC_CANARY
- rcp_canary_check_nodelay \rx,\tag
-.endif
-.endm
-
 .macro clear03 offset=0
  getchaffaddress r0,\offset
  ldmia r0,{r0-r3}
@@ -112,7 +112,9 @@
 @ Put workspace in the second scratch area
 @ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants,
 @ otherwise they may end up silently replaced with 0 or 0xffffffff
-.section .scratch_y.aes,"a",%progbits
+.section .scratch_y.aes,"aw",%progbits
+
+workspace_start:
 
 @ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress
 @ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000)
@@ -126,6 +128,37 @@
 chaff:
 .space 48
 
+.balign 16
+rkey_s:                      @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words
+                             @ see comment at init_key_4way for description of layout and meaning of rkey_s
+.space 600
+rkey4way:                    @ scratch area for init_key_4way; could overlap this with other scratch space if need to save space
+.space 128
+.if CT_BPERM
+bperm_rand:                  @ 32 half words that define the oblivious permutation of blocks
+.space 64
+.endif
+
+.balign 16
+permscratch:                 @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s)
+perm16:
+.space 16
+@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s
+.balign 16
+fourway:                     @ Must be 0 mod 16
+shareA:                      @ 0 mod 16
+.space 20                    @ Only need 16 bytes, but choosing shareB!=shareA mod 16
+shareB:                      @ 4 mod 16
+.space 20
+shareC:                      @ 8 mod 16
+.space 4
+statevperm:                  @ 12 mod 16
+.space 4                     @ vperm state rotation: only last two bits are operational; other bits random
+RKshareC:                    @ Round key common share C; see comment at init_key_4way for explanation
+.space 4
+RKshareCchange:              @ Temporary used by ref_roundkey_share_s
+.space 4
+
 @ Regardless of configuration, the code uses a single 256-entry LUT,
 @ which is a simple S-box table.
 @ The LUT is represented as two shares, lut_a and lut_b,
@@ -143,7 +176,22 @@ chaff:
 @ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁
 .balign 16
 lut_a:                       @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup)
-.space 256
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
 lut_a_map:                   @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b
 .space 4
 .space 4                     @ align to 8 mod 16
@@ -152,38 +200,17 @@ lut_b:                       @ LUT share B (must be 8 mod 16 so that init_key_sb
 lut_b_map:
 .space 4
 .space 4                     @ align to multiple of 8
-rkey_s:                      @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words
-                             @ every fourth word has a word that is used as a vperm count, and also as a spacer to misalign the shares mod 16
-.space 600
-rkey4way:                    @ scratch area for init_key; could overlap this with other scratch space if need to save space
-.space 128
-.if CT_BPERM
-bperm_rand:                  @ 32 half words that define the oblivious permutation of blocks
-.space 64
-.endif
+
 .balign 16
+rstate_all_start:            @ Mark start of RNG data to allow selective memory wipe
 rstate_sha:                  @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero
 .space 16
-rstate_lfsr:                 @ 32-bit LFSR random state and constant used to step it (initialised by C program)
-.space 8
-.balign 16
-permscratch:                 @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s)
-perm16:
-.space 16
-@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s
-.balign 16
-fourway:                     @ Must be 0 mod 16
-shareA:                      @ 0 mod 16
-.space 20                    @ Only need 16 bytes, but choosing shareB!=shareA mod 16
-shareB:                      @ 4 mod 16
-.space 20
-shareC:                      @ 8 mod 16
+jstate:                      @ 32-bit jitter state
 .space 4
-statevperm:                  @ 12 mod 16
-.space 4                     @ vperm state rotation: only last two bits are operational; other bits random
-RKshareC:
+rstate_lfsr:                 @ 32-bit LFSR random state and constant used to step it
 .space 4
-.balign 16
+.word 0x1d872b41             @ constant that defines a maximal-length LFSR
+rstate_all_end:              @ Mark end of RNG data to allow selective memory wipe
 
 .if CT_BPERM
 .balign 16
@@ -195,7 +222,88 @@ murmur3_constants:           @ Five constants used in murmur3_32 hash
 .word 0xc2b2ae35
 .endif
 
-@ Put main code in first scratch area
+scratch_y_end:
+
+@ Initialisation code in main .text section
+.section .text,"ax",%progbits
+
+@ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments.
+@ We feed a stream of raw bits from the TRNG into the SHA hardware accelerator and use the resulting
+@ hash words to seed rstate_sha, rstate_lfsr (and to zero jstate when GEN_RAND_SHA and SH_JITTER are set).
+@ Trashes r0-r6
+.balign 4
+init_rstate:
+ CHK_COUNT 24,6
+ ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET                               @ r4 = TRNG base biased by RNG_IMR offset; all TRNG offsets below are relative to it
+ ldr r5,=SHA256_BASE                                                 @ r5 = SHA-256 register base
+ movs r1,#1
+ str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET     -TRNG_RNG_IMR_OFFSET]     @ write 1 to the TRNG software reset register
+ ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET     -TRNG_RNG_IMR_OFFSET]     @ reads as 0
+ movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS                     @ initialise SHA internal state by writing START bit
+ str r1,[r5,#SHA256_CSR_OFFSET]
+ str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET       -TRNG_RNG_IMR_OFFSET]     @ SAMPLE_CNT1 = r6 (the value just read back, expected 0)
+ movs r6,#TRNG_BLOCKS*2+1                                            @ odd so that we break out of the loop half-way through loading the SHA hardware, giving
+                                                                     @ time for previous SHA computation to complete
+2:
+ movs r1,#0xff                                                       @ TRNG setup is inside loop in case it is skipped.
+ str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET]     @ disable checks and bypass decorrelators, to stream raw TRNG ROSC samples
+ str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET]     @ start ROSC if it is not already started
+ str r1,[r4,#TRNG_RNG_ICR_OFFSET           -TRNG_RNG_IMR_OFFSET]     @ clear all interrupts (including EHR_VLD)
+ adds r0,r4,#TRNG_EHR_DATA0_OFFSET         -TRNG_RNG_IMR_OFFSET      @ r0 = address of EHR_DATA0 (read pointer for the copy loop)
+ movs r2,#TRNG_TRNG_BUSY_OFFSET            -TRNG_RNG_IMR_OFFSET      @ r2 = offset of TRNG_BUSY relative to r4
+1:
+ ldr r1,[r4,r2]                                                      @ wait for 192 ROSC samples to fill EHR, should take constant time
+ cmp r1,#0
+ bne 1b                                                              @ spin until TRNG_BUSY reads 0 (so r1=0 on fall-through)
+ subs r6,#1                                                          @ done?
+ beq 3f
+ movs r1,#8                                                          @ 8 words are copied per pass
+1:
+ ldmia r0!,{r2}                                                      @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1)
+ str r2,[r5,#SHA256_WDATA_OFFSET]                                    @ for a total of half a SHA-256 block
+ subs r1,#1
+ bne 1b
+ ldr r2,[r5,#SHA256_SUM0_OFFSET]                                     @ TRNG is now sampling again; use some SHA bits to modulate the chain length
+ str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET       -TRNG_RNG_IMR_OFFSET]
+ b.n 2b
+
+3:
+ CHK_COUNT 25,6
+ str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET       -TRNG_RNG_IMR_OFFSET]     @ turn off rand source and wipe SHA bits left in TRNG config; r1=0
+ str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET]
+ adds r5,r5,#SHA256_SUM0_OFFSET
+ ldmia r5!,{r0-r3}                                                   @ r0-r3 = four consecutive words starting at SHA256 SUM0
+ ldr r5,=rstate_sha
+ stmia r5,{r0-r3}                                                    @ rstate_sha (16 bytes) = those four words
+ CHK_COUNT 26,6
+
+@ r5=rstate_sha
+ movs r0,#0
+ strb r0,[r5]      @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data"
+@ try to find a non-zero initialiser to create a non-degenerate LFSR
+ ldr r1,[r5,#4]
+ cbnz r1,1f        @ is word 1 non-zero? then use it
+ ldr r1,[r5,#8]
+ cbnz r1,1f        @ otherwise, is word 2 non-zero? use it
+ ldr r1,[r5,#12]
+ cbnz r1,1f        @ otherwise, is word 3 non-zero? use it
+ mov r1,r5         @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-96} probability)
+1:
+ str r1,[r5,#rstate_lfsr-rstate_sha]      @ rstate_lfsr = chosen non-zero seed
+ ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE
+ str r1,[r2,#0]                           @ also write the same seed word to the ROSC RANDOM register
+ CHK_COUNT 27,6
+.if GEN_RAND_SHA
+.if SH_JITTER
+ movs r2,#0
+ str r2,[r5,#jstate-rstate_sha]           @ jstate = 0
+.endif
+.endif
+
+ CHK_COUNT 28,6
+ bx r14                                   @ return to caller
+
+@ Put AES core code in first scratch area
 .section .scratch_x.aes,"ax",%progbits
 
 .if GEN_RAND_SHA
@@ -209,11 +317,26 @@ murmur3_constants:           @ Five constants used in murmur3_32 hash
 .balign 4
 gen_rand_sha:
  push {r14}
- GET_CANARY_NJ r14,CTAG1
+ GET_CANARY r14,CTAG1,2
  push {r1-r3,r14}
+.if SH_JITTER
+ ldr r2,=rstate_sha
+ ldr r0,[r2,#jstate-rstate_sha]
+ movs r1,#1
+ movs r3,r0,lsl#2
+ ands r3,r3,#31
+ movs r3,r1,lsl r3       @ 1<<(4*(r0&7))
+ udiv r3,r3,r1           @ Takes constant + (r0&7) cycles
+ lsrs r0,r0,#1
+ bne 1f
+ bl gen_rand_sha_nonpres
+ ldr r2,=rstate_sha
+1:
+ str r0,[r2,#jstate-rstate_sha]
+.endif
  bl gen_rand_sha_nonpres
  pop {r1-r3,r14}
- CHK_CANARY_NJ r14,CTAG1
+ CHK_CANARY r14,CTAG1,0
  pop {r15}
 
 @ Return single random word in r0
@@ -273,11 +396,11 @@ gen_rand_sha_nonpres:
 gen_rand_sha:
 gen_rand_lfsr:               @ Not used
  push {r14}
- GET_CANARY_NJ r14,CTAG2
+ GET_CANARY r14,CTAG2,2
  push {r1,r2,r14}
  bl gen_rand_lfsr_nonpres
  pop {r1,r2,r14}
- CHK_CANARY_NJ r14,CTAG2
+ CHK_CANARY r14,CTAG2,0
  pop {r15}
 .endif
 
@@ -311,6 +434,56 @@ gen_rand_lfsr_nonpres:
 
 .ltorg
 
+.balign 4
+.thumb_func
+decrypt:                     @ Global entry point. r0 = pointer to 4-way key data; r1-r3 are forwarded to ctr_crypt_s as its r0-r2
+ push {r14}                  @ save return address; popped directly into pc at the end
+ GET_CANARY r14,CTAG3,6      @ r14 = canary for CTAG3 (only emitted if RC_CANARY)
+ SET_COUNT 23,6              @ set the RCP count to 23 (only emitted if RC_COUNT); checked in reset_sha_trng
+ push {r0-r12,r14}           @ save r0-r12 and r14; the saved r0-r3 are popped below as arguments
+ bl reset_sha_trng           @ pulse the SHA-256 and TRNG resets
+ bl init_rstate              @ seed rstate_sha/rstate_lfsr from the TRNG via SHA-256 (performs CHK_COUNT 24..28)
+@ randomly re-share the LUT contents
+ ldr r4,=lut_a
+ mov r5,#64                  @ 64 words = 256 bytes
+1:
+ bl gen_rand_sha_nonpres     @ r0 = random word
+ ldr r6,[r4,#lut_b-lut_a]    @ EOR a random word into both shares
+ eors r6,r6,r0
+ str r6,[r4,#lut_b-lut_a]    @ lut_b word ^= r0
+ ldr r6,[r4]
+ eors r6,r6,r0
+ stmia r4!,{r6}              @ lut_a word ^= r0, and advance r4 to the next word
+ subs r5,r5,#1
+ bne 1b
+ CHK_COUNT 29,6
+ bl remap                    @ scramble the LUTs
+ pop {r0}                    @ pointer to 4way key data
+ CHK_COUNT 30,6
+ bl init_key_4way            @ takes r0 = pointer to 4-way shared raw key data
+ CHK_COUNT 31,6
+ bl lock_key                 @ external routine (.extern lock_key); not defined in this file
+ pop {r0-r2}                 @ the r1-r3 saved on entry become r0-r2 for ctr_crypt_s
+ bl ctr_crypt_s
+ bl randomisechaff           @ re-randomise the chaff area
+ clear03                     @ overwrite r0-r3 with values loaded from the chaff area
+ pop {r4-r12,r14}            @ restore r4-r12; r14 = value saved after GET_CANARY
+ CHK_CANARY r14,CTAG3,6      @ verify the canary (only emitted if RC_CANARY)
+ pop {r15}                   @ return
+
+.balign 4
+.thumb_func
+reset_sha_trng:   @ Assert then release the SHA256 and TRNG reset bits. Trashes r1-r3
+ ldr r1,=RESETS_BASE+RESETS_RESET_OFFSET                   @ r1 = address of the RESETS RESET register
+ ldr r2,[r1]                                               @ r2 = current reset bits
+ ldr r3,=#RESETS_RESET_SHA256_BITS|RESETS_RESET_TRNG_BITS  @ r3 = mask of the SHA256 and TRNG reset bits
+ orrs r2,r2,r3
+ str r2,[r1]       @ reset the SHA hardware and the TRNG hardware
+ CHK_COUNT 23,6    @ pairs with SET_COUNT 23 in decrypt
+ bics r2,r2,r3     @ clear the two reset bits again, leaving the other bits as read
+ str r2,[r1]       @ release the reset
+ bx r14            @ return to caller
+
 .balign 4
 .thumb_func
 makesmallperm:
@@ -321,7 +494,7 @@ makesmallperm:
 @ Trashes r0-r3
 
  push {r14}
- GET_CANARY_NJ r14,CTAG4
+ GET_CANARY r14,CTAG4,6
  push {r4-r6,r14}
  movs r4,r1
  movs r6,r0
@@ -354,7 +527,7 @@ makesmallperm:
 
 2:
  pop {r4-r6,r14}
- CHK_CANARY_NJ r14,CTAG4
+ CHK_CANARY r14,CTAG4,6
  pop {r15}
 
 .balign 4
@@ -365,7 +538,7 @@ makeperm16:
 @ More efficient than calling makeperm with R0=16, R1=perm16 - fewer calls to gen_rand_sha
 @ Trashes r0-r5
 
- GET_CANARY r0,CTAG5
+ GET_CANARY r0,CTAG5,1
  push {r0,r14}
  ldr r4,=perm16
  bl gen_rand_sha_nonpres
@@ -421,7 +594,7 @@ makeperm16:
  bne 1b
 
  pop {r0,r14}
- CHK_CANARY r0,CTAG5
+ CHK_CANARY r0,CTAG5,4
  bx r14
 
 .balign 4
@@ -429,7 +602,7 @@ makeperm16:
 remap:
 @ do a random remap of the LUTs
 @ preserves r0-r11; trashes r12
- GET_CANARY r12,CTAG6
+ GET_CANARY r12,CTAG6,6
  push {r0-r12,r14}
  bl gen_rand_sha_nonpres
  ldr r1,=lut_a
@@ -438,15 +611,14 @@ remap:
  ldr r1,=lut_b
  bl remap_1
  pop {r0-r12,r14}
- CHK_CANARY r12,CTAG6
+ CHK_CANARY r12,CTAG6,6
  bx r14
 
-
 remap_1:
 @ r0: B0:xa B1:xb B2:ya B3:yb
 @ r1: array of 256 bytes, followed by a 4-byte map
 @ shuffle LUT share array such that new[i]=old[i^xa^xb]^ya^yb, update map according to r0
- GET_CANARY_NJ r6,CTAG7
+ GET_CANARY r6,CTAG7,6
  push {r6,r14}
  mov r14,0x01010101
  ubfx r6,r0,#16,#8
@@ -491,7 +663,7 @@ remap_1:
  subs r2,r2,#4
  bpl 1b
  pop {r6,r14}
- CHK_CANARY_NJ r6,CTAG7
+ CHK_CANARY r6,CTAG7,6
  bx r14
 
 .if RK_ROR
@@ -511,7 +683,7 @@ ref_roundkey_shares_s_test:  @ entry point for test code to do fewer than 15 rou
  ldr r4,=rkey_s
  loadlfsr
  steplfsr                    @ r0=change in RKshareC
- adr r2,RKshareCchange
+ ldr r2,=RKshareCchange
  str r0,[r2]
  ldr r3,=RKshareC
  ldr r5,[r3]
@@ -535,7 +707,8 @@ ref_roundkey_shares_s_loop:
  steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1
  steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16;                    str r3,[r4,r9,lsl#2]
 
- ldr r3,RKshareCchange
+ ldr r3,=RKshareCchange
+ ldr r3,[r3]
  movs r2,#0
  usub8 r10,r2,r10
  ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2
@@ -554,9 +727,6 @@ ref_roundkey_shares_s_loop:
  clear03 24
 ref_roundkey_shares_s_exit:
  bx r14
- .balign 4
-RKshareCchange:
- .space 4
 
 .balign 4
 .thumb_func
@@ -570,7 +740,7 @@ RKshareCchange:
 ref_roundkey_hvperms_s:
  movs r7,#30
 ref_roundkey_hvperms_s_test:  @ entry point for test code to do fewer than 30 key shares
- GET_CANARY r10,CTAG9
+ GET_CANARY r10,CTAG9,6
  push {r10,r14}
  ldr r10,=rkey_s
 ref_roundkey_hvperms_s_loop:
@@ -592,7 +762,7 @@ ref_roundkey_hvperms_s_loop:
  clear03 28
 ref_roundkey_hvperms_s_exit:  @ label exit point to be to able to specify to analysis code
  pop {r10,r14}
- CHK_CANARY r10,CTAG9
+ CHK_CANARY r10,CTAG9,6
  bx r14
 
 .else
@@ -604,7 +774,7 @@ ref_roundkey_hvperms_s_exit:  @ label exit point to be to able to specify to ana
 ref_roundkey_shares_s:
  mov r11,#15                 @ there are 15 expanded keys
 ref_roundkey_shares_s_test:  @ entry point for test code to do fewer than 15 rounds
- GET_CANARY r4,CTAG8
+ GET_CANARY r4,CTAG8,6
  push {r4,r14}
  ldr r4,=rkey_s
  loadlfsr
@@ -641,7 +811,7 @@ ref_roundkey_shares_s_loop:
  clear03 24
 ref_roundkey_shares_s_exit:
  pop {r4,r14}
- CHK_CANARY r4,CTAG8
+ CHK_CANARY r4,CTAG8,6
  bx r14
 
 .balign 4
@@ -651,7 +821,7 @@ ref_roundkey_shares_s_exit:
 ref_roundkey_hvperms_s:
  movs r7,#30
 ref_roundkey_hvperms_s_test:  @ entry point for test code to do fewer than 30 key shares
- GET_CANARY r0,CTAG9
+ GET_CANARY r0,CTAG9,6
  push {r0,r14}
  bl gen_rand_lfsr_nonpres
  ldr r1,=rkey_s
@@ -679,11 +849,13 @@ ref_roundkey_hvperms_s_loop:
  clear03 28
 ref_roundkey_hvperms_s_exit:  @ label exit point to be to able to specify to analysis code
  pop {r0,r14}
- CHK_CANARY r0,CTAG9
+ CHK_CANARY r0,CTAG9,6
  bx r14
 
 .endif
 
+.ltorg
+
 .if ST_VPERM
 .balign 4
 .thumb_func
@@ -733,7 +905,7 @@ addstatevperm_exit:           @ label exit point to be to able to specify to ana
 @ Trashes r0-r3,r12
 .balign 4
 ns_to_s:
- GET_CANARY r12,CTAG11
+ GET_CANARY r12,CTAG11,6
  push {r12,r14}
 .if ST_SHAREC
  bl gen_rand_sha_nonpres                   @ Create state share C; all bytes the same
@@ -765,7 +937,7 @@ ns_to_s:
  bl addstatevperm                          @ Initialise state vperm with SHA RNG, refresh with LFSR RNG
 .endif
  pop {r12,r14}
- CHK_CANARY r12,CTAG11
+ CHK_CANARY r12,CTAG11,6
  bx r14
 
 @ Conjugate lut_a, lut_b with shareC
@@ -863,8 +1035,6 @@ shift_rows_s:
 
 @ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1
 .macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b
-@ !!! can probably save some registers, e.g. allow trashing of r0x00, r0x1b
-@ can possibly also simplify slightly with refactorisation
  uadd8 \rt,\rx,\rx           @ field multiplication by 2 as above
  sel \rw,\r0x1b,\r0x00
  eors \rt,\rt,\rw            @ 2x
@@ -904,51 +1074,6 @@ mix_cols_s:
  ldmia r12!,{r0,r1}          @ overwrite  sensitive shareB-related quantities r0,r1 with random numbers
  bx r14
 
-.balign 4
-.thumb_func
-gen_lut_sbox:
-@ gen_lut_sbox sets both lut_a and lut_b to the S-box table and
-@ returns r0=lut_a+256, r1=lut_b+256
-@ first set lut_a to be a table of GF(2⁸) inverses, using lut_b as temporary storage
- ldr r0,=lut_a
- ldr r1,=lut_b
-@ first set lut_a to be a table of antilogarithms, lut_b a table of logarithms
- mov r2,#0
- strb r2,[r0]                @ (*)
- mov r3,#1                   @ we maintain invariant that r2=log(r3)
-1:
- strb r2,[r0,r3]             @ log table
- strb r3,[r1,r2]             @ antilog table
- lsls r12,r3,#25
- it cs
- eorcs r12,r12,#0x1b000000   @ multiply by x
- eor r3,r3,r12,lsr#24        @ multiply by x+1 ("3"), which is a primitive element
- add r2,r2,#1
- cmp r2,#255
- bls 1b
- movs r2,#255
-1:
- ldrb r3,[r0,r2]             @ for each i≠0, find log,...
- eor r3,r3,#255              @ ... negate...
- ldrb r3,[r1,r3]             @ ... and antilog to get inverse
- strb r3,[r0,r2]
- subs r2,r2,#1
- bne 1b                      @ note that inverse(0)=0 by (*) above
-@ At this point r0=lut_a, r1=lut_b, lut_a[] contains inverses and lut_b[] contains other stuff
- mov r12,#256
-1:
- ldrb r2,[r0]
- eors r3,r2,r2,lsl#1         @ convolve byte with 0x1f
- eors r3,r3,r3,lsl#2
- eors r3,r3,r2,lsl#4
- eors r2,r3,r3,lsr#8
- eor r2,r2,#0x63             @ and add 0x63
- strb r2,[r0],#1             @ let lut_a[i]=sbox[i]
- strb r2,[r1],#1             @ let lut_b[i]=sbox[i]
- subs r12,r12,#1
- bne 1b
- bx r14
-
 @ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups)
 .macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3
  ubfx \Rspare0,\Rtarg,#0,  #8
@@ -970,79 +1095,106 @@ gen_lut_sbox:
 .balign 4
 .thumb_func
 map_sbox_s:
- GET_CANARY r12,CTAG12
+ GET_CANARY r12,CTAG12,3
  push {r12,r14}
 
  ldr r0,=shareA                 @ Write out state share A to memory
- stmia r0,{r4-r7}
- clear03                        @ barrier
+@ stmia r0,{r4-r7}              @ Used to do a STM
+ getchaffaddress r1
+ ldr r2,[r1]
+ str r4,[r0]                    @ Intersperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms,
+ str r2,[r1]                    @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired
+ str r5,[r0,#4]                 @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic.
+ str r2,[r1]                    @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1.
+ str r6,[r0,#8]                 @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but
+ str r2,[r1]                    @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic.
+ str r7,[r0,#12]
+ str r2,[r1]
 
  ldr r0,=shareB                 @ Write out state share B to memory
- stmia r0,{r8-r11}
- clear03 4                      @ barrier
+ stmia r0,{r8-r11}              @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with
 
  bl makeperm16                  @ Rebuild random 16-way permutation. Maybe do this less frequently
 @ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation
 
+ bl gen_rand_sha_nonpres
+ mov r11,r0
  ldr r8,=lut_a
  ldr r9,=lut_b
  ldr r0,[r8,#0x100]             @ R0 = a0 | a1<<8 | c0<<16 | c1<<24   (lut_a_map)
- eors r10,r0,r0,lsr#8
- uxtb r10,r10                   @ R10 = a0^a1
+ eors r3,r0,r0,lsr#8            @ R3 = a0^a1 | junk
+ uxtb r10,r3
  ldr r1,[r9,#0x100]             @ R1 = b0 | b1<<8 | d0<<16 | d1<<24   (lut_b_map)
  eors r1,r0,r1
  eors r2,r1,r1,lsr#8
- uxtb r11,r2                    @ R11 = a0^a1^b0^b1
  movs r12,r1,lsr#16             @ R12 = c0^d0 | (c1^d1)<<8
+ bfi r12,r2,#16,#8              @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16
 
  ldr r4,=perm16
  ldr r5,=shareA
  ldr r6,=shareB
-@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=a0^a1^b0^b1, r12=(c0^d0) | (c1^d1)<<8
+ movs r1,#0;movs r2,#0;movs r3,#0
+@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16
  movs r0,#15
 1:                              @ (Ordering instructions to minimise result delays)
  ldrb r1,[r4,r0]                @ r1 = perm[r0]
+ mov  r11,r11,ror#11            @ Rotate random 32 bits to present a new low 8 bits
  eors r7,r1,#2                  @ r7 = perm[r0]^2
  ldrb r2,[r5,r1]                @ r2 = shareA[perm[r0]]
+ eor  r11,r11,r2,ror#8          @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted)
  ldrb r3,[r6,r7]                @ r3 = shareB[perm[r0]^2]
- eors r2,r2,r10                 @ r2 = shareA[perm[r0]]^a0^a1
+ eor  r2,r2,r10                 @ r2 = shareA[perm[r0]]^a0^a1
  eors r2,r2,r3                  @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]
  ldrb r3,[r8,r2]                @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]
- eors r3,r3,r12                 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8)
- eors r2,r2,r11                 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]
- strb r3,[r5,r1]                @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0
+ eor  r2,r2,r12,lsr#16          @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]
+ eor  r3,r3,r12                 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8)
+ eor  r3,r3,r11                 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8)
+ strb r3,[r5,r1]                @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand
  ldrb r3,[r9,r2]                @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]
  subs r0,r0,#1
- eor  r3,r3,r12,lsr#8           @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1
- strb r3,[r6,r7]                @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1
+ eor  r3,r3,r11                 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand
+ eor  r3,r3,r12,lsr#8           @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8)
+ strb r3,[r6,r7]                @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1
  bpl 1b
  clear03 8                      @ barrier
 
  ldmia r6,{r8-r11}              @ Read state share B back from memory
  clear03 12                     @ barrier
- ldmia r5,{r4-r7}               @ Read state share A back from memory
- clear03 16                     @ barrier
+ getchaffaddress r0,16
+ bfi r0,r5,#0,#4                @ match chaff pointer (r0) to share A location (R5) mod 16
+ @ldmia r5,{r4-r7}               @ Read state share A back from memory
+ @clear03 16                     @ barrier
+ ldr r4,[r5]                    @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s
+ ldr r1,[r0]
+ ldr r6,[r5,#8]
+ ldr r1,[r0,#8]
+ ldr r7,[r5,#12]
+ ldr r1,[r0,#12]
+ ldr r5,[r5,#4]                 @ Do r5 last because it's the address register
+ ldr r1,[r0,#4]
 
 @ Refresh state shares because luts only give imperfect share-by-value
-
- loadlfsr
- steplfsr; eors r4,r4,r0; mov r12,#0; eors r8,r8,r0,ror#16              @ Barriers between each pair of eors to prevent implicit r4^r8 etc
- steplfsr; eors r5,r5,r0; mov r12,#0; eors r9,r9,r0,ror#16
- steplfsr; eors r6,r6,r0; mov r12,#0; eors r10,r10,r0,ror#16
- steplfsr; eors r7,r7,r0; mov r12,#0; eors r11,r11,r0,ror#16
- savelfsr
+@ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox operation (though the R11 bytes are not fully independent)
+@ loadlfsr
+@ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16              @ Barriers between each pair of eors to prevent implicit r4^r8 etc
+@ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16
+@ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16
+@ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16
+@ savelfsr
 
  pop {r12,r14}
- CHK_CANARY r12,CTAG12
+ CHK_CANARY r12,CTAG12,5
  bx r14
 
+.ltorg
+
 .balign 4
 .thumb_func
 randomisechaff:
 @ Randomise 48 bytes of chaff values (random load values)
 @ Uses 12 bytes of permscratch
 @ Trashes r0-3
- GET_CANARY r0,CTAG13
+ GET_CANARY r0,CTAG13,6
  push {r0,r14}
  movs r0,#12
  ldr r1,=permscratch
@@ -1059,19 +1211,32 @@ randomisechaff:
  subs r1,r1,#1
  bpl 1b
  pop {r0,r14}
- CHK_CANARY r0,CTAG13
+ CHK_CANARY r0,CTAG13,6
  bx r14
 
 .balign 4
-refreshchaff:
+refreshchaff_and_lfsr:
 @ Update 48 bytes of chaff values (random load values) using faster RNG than used for randomisechaff
+@ Re-randomise LFSR with SHA
 @ Uses 12 bytes of permscratch
 @ Trashes r0-3,12
- GET_CANARY r0,CTAG14
+ GET_CANARY r0,CTAG14,6
  push {r0,r14}
+
+@ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence
+ bl gen_rand_sha_nonpres
+ ldr r1,=rstate_lfsr
+ ldr r2,[r1]
+ adds r2,r2,r0
+ beq 1f           @ Don't update LFSR state to 0
+ str r2,[r1]
+1:
+
+@ Choose a random order to update chaff words to make 2nd order attacks harder
  movs r0,#12
  ldr r1,=permscratch
- bl makesmallperm           @ Update the random words in a random order to make 2nd order attacks harder
+ bl makesmallperm
+ 
  movs r1,#11
 1:
  push {r1}
@@ -1086,7 +1251,7 @@ refreshchaff:
  subs r1,r1,#1
  bpl 1b
  pop {r0,r14}
- CHK_CANARY r0,CTAG14
+ CHK_CANARY r0,CTAG14,6
  bx r14
 
 .balign 4
@@ -1094,7 +1259,7 @@ refreshchaff:
 @ Do sbox on the four bytes of the 4-way share r4-r7
 @ Trashes r0,r8-r12
 init_key_sbox:
- GET_CANARY r12,CTAG15
+ GET_CANARY r12,CTAG15,6
  push {r1-r3,r12,r14}
  bl gen_rand_sha_nonpres; mov r8,r0
  bl gen_rand_sha_nonpres; mov r9,r0
@@ -1113,16 +1278,16 @@ init_key_sbox:
  movs r0,#4
  ldr r1,=permscratch
  bl makesmallperm               @ Build random 4-way permutation determining order of bytes to be SBOXed
- ldr r1,=permscratch            @ Write out random addresses in advance to save two registers
+ ldr r1,=permscratch            @ Write out random addresses in advance to save two registers (reusing permscratch)
  ldr r4,[r1]
  ldr r0,=fourway
  uxtab r5,r0,r4
  uxtab r6,r0,r4,ror#8
  uxtab r7,r0,r4,ror#16
  uxtab r8,r0,r4,ror#24
- stmia r1,{r5-r8}               @ Store fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3]
+ stmia r1,{r5-r8}               @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3]
 
- bl gen_rand_sha                    @ Save some randomness for the resharing operation later
+ bl gen_rand_sha                @ Save some randomness for the resharing operation later
  movs r7,r0
  bl gen_rand_sha
  movs r8,r0
@@ -1148,8 +1313,8 @@ init_key_sbox:
  ldr  r6,[r1],#4                @ r6 = fourway + perm[i] (i=0-3, loop iteration)
  ldr  r5,[r5]                   @ Random load to mask previous load
 
- ands r9,r6,#12                 @ r9 = chaff address aligned to r6 mod 16
- add  r9,r11,r9
+ ands r9,r6,#12
+ add  r9,r11,r9                 @ r9 = chaff address aligned to (r6 bic 3) mod 16
  ldrb r4,[r6,#0]
  ldr  r14,[r9,#0]               @ Random load to mask previous load
  eor  r4,r4,r10
@@ -1171,7 +1336,7 @@ init_key_sbox:
  eor  r4,r4,r14,lsl#8           @ Add in some junk in bits 8-31
 
  ands r14,r4,#255
- ldrb r5,[r2,r14]                @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]
+ ldrb r5,[r2,r14]               @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]
  and  r14,r4,#15
  add  r14,r14,#32
  ldrb r14,[r11,r14]             @ Random load to mask previous load (r2 and r11 are both 0 mod 16)
@@ -1207,7 +1372,7 @@ init_key_sbox:
  ldmia r11,{r8-r12,r14}         @ Random load to mask previous load and to obfuscate registers
 
  pop {r1-r3,r12,r14}
- CHK_CANARY r12,CTAG15
+ CHK_CANARY r12,CTAG15,6
  bx r14
 
 .balign 4
@@ -1221,7 +1386,7 @@ init_key_sbox:
 @ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4])
 @ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of Bptr[4])+16
 storeroundkey:
- GET_CANARY r8,CTAG16
+ GET_CANARY r8,CTAG16,6
  push {r2,r8,r14}
 
 @ eor two 4-way share components to make a component of a 2-way share
@@ -1291,12 +1456,12 @@ storeroundkey:
  adds r3,r3,#4               @ Set     r3 = (r3 on entry) + 40
 
  pop {r2,r8,r14}
- CHK_CANARY r8,CTAG16
+ CHK_CANARY r8,CTAG16,6
  bx r14
 
 .balign 4
 .thumb_func
-init_key:
+init_key_4way:
 @ On entry, r0 points to 4-way shared raw key data (128 bytes)
 @ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7
 @ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K.
@@ -1312,17 +1477,21 @@ init_key:
 @ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16
 @ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC
 
- GET_CANARY r12,CTAG17
- push {r4-r11,r12,r14}
+ GET_CANARY r12,CTAG17,6
+ push {r0-r12,r14}
  
+@ Transfer 4-way key into local workspace, rerandomising the shares
  mov r5,r0                   @ r5=4-way key input
  bl randomisechaff
- ldr r4,=rkey4way
- movs r6,#8
+ ldr r6,=rkey4way
+ movs r7,#8
 1:
- ldmia r5!,{r0-r3}
- stmia r4!,{r0-r3}
- subs r6,r6,#1
+ ldmia r5!,{r1-r4}
+ bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0
+ bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0
+ bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0
+ stmia r6!,{r1-r4}
+ subs r7,r7,#1
  bne 1b
 
 @ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for
@@ -1400,10 +1569,12 @@ init_key_expandloop:
  cmp r2,#52
  bne init_key_expandloop
 
- pop {r4-r11,r12,r14}
- CHK_CANARY r12,CTAG17
+ pop {r0-r12,r14}
+ CHK_CANARY r12,CTAG17,6
  bx r14
 
+.ltorg
+
 @ Add the round key shares pointed to by r12 into the state shares
 @ Trashes r0-r3
 .balign 4
@@ -1421,7 +1592,7 @@ addrkey_s:
  ldr r1,[r12,#16]            @ r1=vperm key rotation in top two bits
  ldr r2,[r0,#16]             @ barrier load
 
- rsbs r2,r3,r1,lsr#30        @ r2=vpermkeyrot-vpermstaterot
+ rsb r2,r3,r1,lsr#30         @ r2=vpermkeyrot-vpermstaterot
 @ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot
 @ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr
 .if RK_ROR
@@ -1444,7 +1615,7 @@ addrkey_s:
  bfi r0,r12,#0,#4            @ match chaff pointer (r0) to roundkey ptr (r12) mod 16
  ldr r1,[r12,#16]            @ r1=vperm key rotation in top two bits
  ldr r2,[r0,#16]             @ barrier load
- rsbs r2,r3,r1,lsr#30        @ r2=vpermkeyrot-vpermstaterot
+ rsb r2,r3,r1,lsr#30         @ r2=vpermkeyrot-vpermstaterot
  ldr r3,=RKshareC            @ r3=common round key shareC
  bfi r0,r3,#0,#4
  ldr r3,[r3]
@@ -1466,7 +1637,6 @@ addrkey_s:
  ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16;                eors r11,r11,r0
 .endif
  clear03
- 
  bx r14
 
 .balign 4
@@ -1484,11 +1654,11 @@ addrkey_s:
 
 ctr_crypt_s:
 @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks
- GET_CANARY r12,CTAG0
- push {r0,r4-r11,r12,r14}
+ GET_CANARY r12,CTAG0,6
+ push {r0-r12,r14}           @ save all registers so that when we restore we overwrite any secrets
 
  push {r0-r2}
- SET_COUNT 93
+ SET_COUNT 93,6
 
 .if CT_BPERM
 @ Initialise 32 random numbers (which fit in half-words)
@@ -1505,10 +1675,10 @@ ctr_crypt_s:
  bl randomisechaff
  pop {r0-r2}
  movs r3,#0
- CHK_COUNT 93
+ CHK_COUNT 93,6
 
 ctr_crypt_mainloop:
- SET_COUNT 80
+ SET_COUNT 80,6
 @ here r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
 
 @ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it)
@@ -1517,7 +1687,7 @@ ctr_crypt_mainloop:
 
  tst r3,#(REFCHAFF_PERIOD-1)
  bne 1f
- bl refreshchaff
+ bl refreshchaff_and_lfsr
 1:
 
  ldr r3,[r13,#12]            @ get block count off the stack
@@ -1525,7 +1695,7 @@ ctr_crypt_mainloop:
  bne 1f
  bl remap                    @ shuffle the LUTs; this preserves R3
 1:
- CHK_COUNT 80
+ CHK_COUNT 80,6
 
  tst r3,#(REFROUNDKEYSHARES_PERIOD-1)
  bne 1f
@@ -1538,7 +1708,8 @@ ctr_crypt_mainloop:
  bl ref_roundkey_hvperms_s   @ refresh the round key vperms
 1:
 
- CHK_COUNT 81
+ CHK_COUNT 81,6
+
  pop {r0-r3}
 @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
 
@@ -1586,9 +1757,9 @@ ctr_crypt_mainloop:
 .else
  mov r12,r3
 .endif
- CHK_COUNT 82
+ CHK_COUNT 82,6
 
-@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered
+@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered)
  push {r0-r3,r12}
 
 processIV:                   @ non-target label to assist power analysis
@@ -1605,36 +1776,36 @@ processIV:                   @ non-target label to assist power analysis
  ldr r0,[r13]                @ peek at stack to restore r0=IV ptr
  ldmia r0,{r4-r7}            @ load IV
  clear03                     @ barrier to remove traces of IV from internal CPU load registers
- push {r0-r3}                @ We want to randomise the internal memory registers associated with the above LDM load, but this
- pop {r0-r3}                 @ may come from non-scratch memory and have its own internal registers, so we clear it using a
-                             @ stack save/load. Either R13 is in non-scratch memory, in which case this works, or it isn't, in
-                             @ which case it doesn't matter, because the only subsequent use of non-scratch memory is the stack.
 
 @ Add in r9 in byte-big-endian, bit-little-endian (!) fashion, while trying to avoid rev operations
 @ as far as possible as these tend to expose (via power fluctuations) byte-level hamming weights.
-@ It's worth avoiding revs on r6, r5, r4, even at the cost of introducing a small timing dependency.
-
 @ First do 128-bit addition of r9 to byte-reversed IV
- rev r7,r7; adds r7,r7,r9;            bcc 1f
- rev r6,r6; adcs r6,r6,#0; rev r6,r6; bcc 1f
- rev r5,r5; adcs r5,r5,#0; rev r5,r5; bcc 1f
- rev r4,r4; adcs r4,r4,#0; rev r4,r4
+ rev r7,r7
+ cmn r7,#MAX_NUM_BLOCKS      @ Compare against maximum number of blocks
+ bcs 1f
+ add r7,r7,r9                @ This can temporarily overflow but it doesn't matter as we know that r7+r12 does not overflow
+ sub r7,r7,r8
+ b 2f
 1:
-@ At this point, r7 is reversed and r4-r6 are not
+            adds r7,r7,r9
+ rev r6,r6; adcs r6,r6,#0
+ rev r5,r5; adcs r5,r5,#0
+ rev r4,r4; adcs r4,r4,#0
 @ Now do 128-bit subtraction of r8 from byte-reversed IV
-            subs r7,r7,r8; rev r7,r7; bcs 1f
- rev r6,r6; sbcs r6,r6,#0; rev r6,r6; bcs 1f
- rev r5,r5; sbcs r5,r5,#0; rev r5,r5; bcs 1f
- rev r4,r4; sbcs r4,r4,#0; rev r4,r4
-1:
+ subs r7,r7,r8
+ sbcs r6,r6,#0; rev r6,r6
+ sbcs r5,r5,#0; rev r5,r5
+ sbcs r4,r4,#0; rev r4,r4
+2:
+ rev r7,r7
  clear01 16
- CHK_COUNT 83
+ CHK_COUNT 83,6
 
 @ r4-r7 = IV for the current block
  bl ns_to_s                  @ convert IV+x to shares, which includes choosing and incorporating a random shareC
- CHK_COUNT 84
+ CHK_COUNT 84,6
  bl conjshareC               @ Add the effect of shareC to lut_a, lut_b
- CHK_COUNT 85
+ CHK_COUNT 85,6
 @ now perform the 15 encryption rounds on (key, state=IV+x)
 @ here r4-r7, r8-r11: state
  mov r2,#0                   @ round counter
@@ -1647,7 +1818,7 @@ rounds_s_mainloop:
  bl map_sbox_s
  bl shift_rows_s
 .if ST_VPERM
- ldmia r13,{r2}              @ peek at stack to get round count
+ ldr r2,[r13]                @ peek at stack to get round count
  cmp r2,#NUMREFSTATEVPERM
  bcs 1f
  bl gen_rand_lfsr_nonpres
@@ -1664,12 +1835,12 @@ rounds_s_mainloop:
  pop {r2}
  b rounds_s_mainloop
 2:
- CHK_COUNT 86
+ CHK_COUNT 86,6
  ldr r12,=rkey_s+14*40       @ final round key shares
  bl addrkey_s
- CHK_COUNT 87
+ CHK_COUNT 87,6
  bl conjshareC               @ Undo the effect of shareC from lut_a, lut_b
- CHK_COUNT 88
+ CHK_COUNT 88,6
 .if ST_VPERM
 @ Undo the effects of vperm rotation recorded in statevperm
  ldr r1,=statevperm
@@ -1682,6 +1853,7 @@ rounds_s_mainloop:
  push {r0,r3}
 @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered
 
+decryption_start:
 @ Decrypt ciphertext using AES output in shares: r4-r11
 .if ST_SHAREC
  ldr r0,=shareC
@@ -1689,38 +1861,79 @@ rounds_s_mainloop:
 .else
  movs r0,#0
 .endif
- CHK_COUNT 89
+ ldr r14,=chaff
+@ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff
+ CHK_COUNT 89,6
  add r1,r1,r12,lsl#4         @ Temporarily r1 points to block-to-be-deciphered
- ldr r3,[r1]
- eors r3,r3,r4
- eors r3,r3,r8,ror#16        @ Now r4 and r8 are free
- eors r3,r3,r0
- str r3,[r1]
- ldr r3,[r1,#4]
+ ldr r3,[r1]                 @ r3=ciphertext word
+ eors r3,r3,r4               @ r3=r3^shareA
+ ldr r4,[r14]                @ barrier load
+ eor r3,r3,r8,ror#16         @ r3=r3^shareB
+ eors r3,r3,r0               @ r3=r3^shareC
+ str r3,[r1]                 @ plaintext word=r3
+ ldr r3,[r1,#4]              @ and similarly for words 1,2,3 of block...
+ ldr r4,[r14,#4]
  eors r3,r3,r5
- eors r3,r3,r9,ror#16
+ eor r3,r3,r9,ror#16
  eors r3,r3,r0
  str r3,[r1,#4]
  ldr r3,[r1,#8]
+ ldr r4,[r14,#8]
  eors r3,r3,r6
- eors r3,r3,r10,ror#16
+ eor r3,r3,r10,ror#16
  eors r3,r3,r0
  str r3,[r1,#8]
  ldr r3,[r1,#12]
+ ldr r4,[r14,#12]
  eors r3,r3,r7
- eors r3,r3,r11,ror#16
+ eor r3,r3,r11,ror#16
  eors r3,r3,r0
  str r3,[r1,#12]
+
  sub r1,r1,r12,lsl#4         @ Restore r1 to point to start of buffer
- CHK_COUNT 90
+ CHK_COUNT 90,6
 
  pop {r0,r3}                 @ Restore IV and block counter
 @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
+decryption_end:
 
  adds r3,r3,#1
  cmp r3,r2
- CHK_COUNT 91
+ CHK_COUNT 91,6
  bne ctr_crypt_mainloop
- pop {r0,r4-r11,r12,r14}
- CHK_CANARY r12,CTAG0
+
+#if WIPE_MEMORY
+@ Wipe memory from workspace_start up to the stack pointer
+@ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals
+ ldr r4,=workspace_start
+ ldr r5,=rstate_all_start
+1:
+ bl gen_rand_sha_nonpres
+ stmia r4!,{r0}
+ cmp r4,r5
+ bcc 1b
+ ldr r4,=rstate_all_end
+ mov r5,r13                  @ gcc arm assembler says cmp r4,r13 is deprecated, so use another register
+1:
+ bl gen_rand_sha_nonpres
+ stmia r4!,{r0}
+ cmp r4,r5
+ bcc 1b
+
+@ Then fill everything with zeros so as not to leave behind clues about the RNG state
+ ldr r4,=workspace_start
+ movs r0,#0
+ mov r5,r13
+1:
+ stmia r4!,{r0}
+ cmp r4,r5
+ bcc 1b
+#endif
+
+.if GEN_RAND_SHA
+ SET_COUNT 23,6
+ bl reset_sha_trng           @ clear out the SHA hardware
+.endif
+ pop {r0-r12,r14}
+ CHK_CANARY r12,CTAG0,6
  bx r14
diff --git a/bootloaders/encrypted/config.h b/bootloaders/encrypted/config.h
index dd0c9898e..2c4ce0d03 100644
--- a/bootloaders/encrypted/config.h
+++ b/bootloaders/encrypted/config.h
@@ -1,6 +1,6 @@
 #pragma once
 
-// These options should be enabled because the security risk of not using them is too high
+// These options (everything up to the long /////////////// line below) should be enabled because the security risk of not using them is too high
 // or because the time cost is very low so you may as well have them.
 // They can be set to 0 for analysis or testing purposes.
 
@@ -22,6 +22,10 @@
 #define RK_ROR               1         // store round key shares with random rotations within each word
 #endif
 
+#ifndef WIPE_MEMORY
+#define WIPE_MEMORY          1         // Wipe memory after decryption
+#endif
+
 // The following options should be enabled to increase resistance to glitching attacks.
 
 #ifndef RC_CANARY
@@ -31,34 +35,42 @@
 #define RC_COUNT             1         // use rcp_count feature
 #endif
 
-// Although enabling the following option likely has little theoretical benefit, in
-// practice randomising the timing of operations can make side-channel attacks very
-// much more effort to carry out. It can be disabled for analysis or testing purposes.
+// Although jitter/timing-variation may be circumventable in theory, in practice
+// randomising the timing of operations can make side-channel attacks very much more
+// effort to carry out. These can be disabled for analysis or testing purposes.
+// It is advisable to use at least one form of jitter.
 
+// RC_JITTER is quite slow, and is probably the most predictable of the three, so it is disabled by default.
+// (Leaving it as an option because it's just possible that the large delays it produces are advantageous in defeating certain side-channel attacks.)
 #ifndef RC_JITTER
-#define RC_JITTER            1         // use random-delay versions of RCP instructions
+#define RC_JITTER            0         // 0-7. Higher = more jitter. Governs use of random-delay versions of RCP instructions.
 #endif
 
+#ifndef SH_JITTER
+#define SH_JITTER            1         // Insert random delays, tagged onto SHA RNG
+#endif
+
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 // The following options can be adjusted, affecting the performance/security tradeoff
 
 // Period = X means that the operation in question occurs every X blocks, so higher = more performance and lower security.
 // No point in making them more than 16 or so, since the time taken by the subroutines would be negligible.
-// These must be a power of 2. Timings as of commit 24277d13
-//                                                                            RK_ROR=0    RK_ROR=1
-//                                        Baseline time per 16-byte block = {    14066       14336 }                          cycles
+// These must be a power of 2. Timings as of commit 82d31652
+// 
+//                                        Baseline time per 16-byte block = 14109 (with no jitter)         cycles
 #ifndef REFCHAFF_PERIOD
-#define REFCHAFF_PERIOD             1     // Extra cost per 16-byte block = {      462         462 }/REFCHAFF_PERIOD          cycles
+#define REFCHAFF_PERIOD             1     // Extra cost per 16-byte block =   474/REFCHAFF_PERIOD          cycles
 #endif
 #ifndef REMAP_PERIOD
-#define REMAP_PERIOD                4     // Extra cost per 16-byte block = {     4131        4131 }/REMAP_PERIOD             cycles
+#define REMAP_PERIOD                4     // Extra cost per 16-byte block =  4148/REMAP_PERIOD             cycles
 #endif
 #ifndef REFROUNDKEYSHARES_PERIOD
-#define REFROUNDKEYSHARES_PERIOD    1     // Extra cost per 16-byte block = {     1107        1212 }/REFROUNDKEYSHARES_PERIOD cycles
+#define REFROUNDKEYSHARES_PERIOD    1     // Extra cost per 16-byte block =  1304/REFROUNDKEYSHARES_PERIOD cycles
 #endif
 #ifndef REFROUNDKEYHVPERMS_PERIOD
-#define REFROUNDKEYHVPERMS_PERIOD   1     // Extra cost per 16-byte block = {      936        1422 }/REFROUnDKEYVPERM_PERIOD  cycles
+#define REFROUNDKEYHVPERMS_PERIOD   1     // Extra cost per 16-byte block =  1486/REFROUNDKEYHVPERMS_PERIOD cycles
 #endif
 
 // Setting NUMREFSTATEVPERM to X means that state vperm refreshing happens on the first X AES rounds only,
@@ -66,5 +78,13 @@
 // The rationale for doing it this way is that later rounds should be protected by CT_BPERM.
 // NUMREFSTATEVPERM can be from 0 to 14.
 #ifndef NUMREFSTATEVPERM
-#define NUMREFSTATEVPERM            7     // Extra cost per 16-byte block =  80*NUMREFSTATEVPERM cycles
+#define NUMREFSTATEVPERM            7     // Extra cost per 16-byte block =  61*NUMREFSTATEVPERM cycles
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define MAX_NUM_BLOCKS 32768
+
+#if SH_JITTER && !GEN_RAND_SHA
+#error GEN_RAND_SHA must be set if you want to use SH_JITTER
 #endif
diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c
index 79fb8fb10..814ce70cc 100644
--- a/bootloaders/encrypted/enc_bootloader.c
+++ b/bootloaders/encrypted/enc_bootloader.c
@@ -16,51 +16,17 @@
 
 #include "config.h"
 
-volatile uint32_t systick_data[18]; // count, R0-R15,RETPSR
-
-extern void remap();
-extern uint32_t gen_rand_sha();
-extern void init_key(uint8_t *key);
-extern void gen_lut_sbox();
-extern int  ctr_crypt_s(uint8_t*iv,uint8_t*buf,int nblk);
-
-extern uint8_t rkey_s[480];
-extern uint8_t lut_a[256];
-extern uint8_t lut_b[256];
-extern uint32_t lut_a_map[1];
-extern uint32_t lut_b_map[1];
-extern uint32_t rstate_sha[4],rstate_lfsr[2];
-
-void resetrng() {
-    uint32_t f0,f1;
-    do f0=get_rand_32(); while(f0==0);   // make sure we don't initialise the LFSR to zero
-    f1=get_rand_32();
-    rstate_sha[0]=f0&0xffffff00;         // bottom byte must be zero (or 4) for SHA, representing "out of data"
-    rstate_sha[1]=f1;
-    rstate_sha[2]=0x41414141;
-    rstate_sha[3]=0x41414141;
-    rstate_lfsr[0]=f0;                   // must be nonzero for non-degenerate LFSR
-    rstate_lfsr[1]=0x1d872b41;           // constant that defines LFSR
-#if GEN_RAND_SHA
-    reset_block(RESETS_RESET_SHA256_BITS);
-    unreset_block(RESETS_RESET_SHA256_BITS);
-#endif
-}
+#define OTP_KEY_PAGE 30
 
-static void init_lut_map() {
-    int i;
-    for(i=0;i<256;i++) lut_b[i]=gen_rand_sha()&0xff, lut_a[i]^=lut_b[i];
-    lut_a_map[0]=0;
-    lut_b_map[0]=0;
-    remap();
-}
+extern void decrypt(uint8_t* key4way, uint8_t* iv, uint8_t(*buf)[16], int nblk);
 
-static void init_aes() {
-    resetrng();
-    gen_lut_sbox();
-    init_lut_map();
+// lock_key() is called from decrypt() (aes.S) once key initialisation is complete and before decryption begins.
+// That is a suitable point to lock the OTP page (OTP_KEY_PAGE) where the key information is stored.
+void lock_key(void) {
+    otp_hw->sw_lock[OTP_KEY_PAGE] = 0xf; // NOTE(review): 0xf presumably sets both lock fields to "inaccessible" - confirm against the OTP SW_LOCK register description
+}
 
+
 static __attribute__((aligned(4))) uint8_t workarea[4 * 1024];
 
 int main() {
@@ -182,13 +148,10 @@ int main() {
     for (int i=0; i < 4; i++)
         printf("%08x\n", *(uint32_t*)(SRAM_BASE + i*4));
 
-    init_aes();
     // Read key directly from OTP - guarded reads will throw a bus fault if there are any errors
     uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE;
 
-    init_key((uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x780)]));
-    otp_hw->sw_lock[30] = 0xf;
-    ctr_crypt_s(iv, (void*)SRAM_BASE, data_size/16);
+    decrypt((uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS &  (OTP_KEY_PAGE * 0x40))]), iv, (void*)SRAM_BASE, data_size/16);
 
     printf("Post decryption image begins with\n");
     for (int i=0; i < 4; i++)