diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c8e3b0597..52abdbb1e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,6 +269,18 @@ if(NOT WOLFSSL_SINGLE_THREADED) endif() endif() +# DTLS-SRTP +add_option("WOLFSSL_SRTP" + "Enables wolfSSL DTLS-SRTP (default: disabled)" + "no" "yes;no") + +if(WOLFSSL_SRTP) + list(APPEND WOLFSSL_DEFINITIONS + "-DWOLFSSL_SRTP") + set(WOLFSSL_DTLS "yes") + set(WOLFSSL_KEYING_MATERIAL "yes") +endif() + # DTLS add_option("WOLFSSL_DTLS" diff --git a/ChangeLog.md b/ChangeLog.md index 89959661d5..5e4591149e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -23,7 +23,7 @@ NOTE: * --enable-heapmath is being deprecated and will be removed by 2024 * Added LMS/HSS and XMSS/XMSS^MT wolfcrypt hooks, both normal and verify-only options. * Added support for the AES EAX mode of operation * Port for use with Hitch (https://github.com/varnish/hitch) added -* Add XTS API's to handle multiple sectors in new port ot VeraCrypt +* Add XTS API's to handle multiple sectors in new port to VeraCrypt ## Enhancements and Optimizations diff --git a/IDE/CRYPTOCELL/main.c b/IDE/CRYPTOCELL/main.c index cc596fe8fb..7938d0dfae 100644 --- a/IDE/CRYPTOCELL/main.c +++ b/IDE/CRYPTOCELL/main.c @@ -27,7 +27,7 @@ /* wolfCrypt_Init/wolfCrypt_Cleanup to turn CryptoCell hardware on/off */ #include -/* SEGGER_RTT_Init, you can potential replace it with other serial terminal */ +/* SEGGER_RTT_Init, you can potentially replace it with other serial terminal */ #include "SEGGER_RTT.h" int main(void) diff --git a/IDE/Espressif/ESP-IDF/examples/wolfssl_client/components/wolfssl/CMakeLists.txt b/IDE/Espressif/ESP-IDF/examples/wolfssl_client/components/wolfssl/CMakeLists.txt index b5ee75c613..2f1e9e4111 100644 --- a/IDE/Espressif/ESP-IDF/examples/wolfssl_client/components/wolfssl/CMakeLists.txt +++ b/IDE/Espressif/ESP-IDF/examples/wolfssl_client/components/wolfssl/CMakeLists.txt @@ -206,7 +206,7 @@ else() "\"${WOLFSSL_ROOT}/wolfcrypt/src\"" 
"\"${WOLFSSL_ROOT}/wolfcrypt/src/port/Espressif\"" "\"${WOLFSSL_ROOT}/wolfcrypt/src/port/atmel\"" - # TODO: Make this a univeral makefile that detects if bechmark / test needed + # TODO: Make this a universal makefile that detects if benchmark / test needed # Sometimes problematic with SM; consider gating detection. #"\"${WOLFSSL_ROOT}/wolfcrypt/benchmark\"" # the benchmark application #"\"${WOLFSSL_ROOT}/wolfcrypt/test\"" # the test application diff --git a/IDE/Espressif/ESP-IDF/examples/wolfssl_server/components/wolfssl/CMakeLists.txt b/IDE/Espressif/ESP-IDF/examples/wolfssl_server/components/wolfssl/CMakeLists.txt index b5ee75c613..2f1e9e4111 100644 --- a/IDE/Espressif/ESP-IDF/examples/wolfssl_server/components/wolfssl/CMakeLists.txt +++ b/IDE/Espressif/ESP-IDF/examples/wolfssl_server/components/wolfssl/CMakeLists.txt @@ -206,7 +206,7 @@ else() "\"${WOLFSSL_ROOT}/wolfcrypt/src\"" "\"${WOLFSSL_ROOT}/wolfcrypt/src/port/Espressif\"" "\"${WOLFSSL_ROOT}/wolfcrypt/src/port/atmel\"" - # TODO: Make this a univeral makefile that detects if bechmark / test needed + # TODO: Make this a universal makefile that detects if benchmark / test needed # Sometimes problematic with SM; consider gating detection. 
#"\"${WOLFSSL_ROOT}/wolfcrypt/benchmark\"" # the benchmark application #"\"${WOLFSSL_ROOT}/wolfcrypt/test\"" # the test application diff --git a/IDE/Espressif/ESP-IDF/examples/wolfssl_test/main/main.c b/IDE/Espressif/ESP-IDF/examples/wolfssl_test/main/main.c index 63aaaf27e4..bcf220d8e2 100644 --- a/IDE/Espressif/ESP-IDF/examples/wolfssl_test/main/main.c +++ b/IDE/Espressif/ESP-IDF/examples/wolfssl_test/main/main.c @@ -241,6 +241,6 @@ void app_main(void) #else vTaskDelay(60000); #endif - } /* done whle */ + } /* done while */ #endif } diff --git a/IDE/Renesas/cs+/Projects/t4_demo/README_en.txt b/IDE/Renesas/cs+/Projects/t4_demo/README_en.txt index 492d2c7f37..6e2a3bcc44 100644 --- a/IDE/Renesas/cs+/Projects/t4_demo/README_en.txt +++ b/IDE/Renesas/cs+/Projects/t4_demo/README_en.txt @@ -12,7 +12,7 @@ Setup process: - Unzip wolfssl under the same directory 2. Set up wolfSSL - - open wolfssl\IDE\Renesas\cs+\Projec/wolfssl\lib.mtpj with CS+ and build + - open wolfssl\IDE\Renesas\cs+\Projects\wolfssl\lib.mtpj with CS+ and build - open t4_demo.mtpj and build. This create demo program library. 3. Set up AlphaProject diff --git a/IDE/Renesas/e2studio/RZN2L/README.md b/IDE/Renesas/e2studio/RZN2L/README.md index 8433702da4..55df4f2c15 100644 --- a/IDE/Renesas/e2studio/RZN2L/README.md +++ b/IDE/Renesas/e2studio/RZN2L/README.md @@ -152,7 +152,7 @@ $./examples/server/server -b -d -i -v 4 + For ECDSA sign and verify use, Enable the `USE_CERT_BUFFER_256` macro in `wolfssl_demo.h` -Disble the `USE_CERT_BUFFER_2048` macro in `wolfssl_demo.h` +Disable the `USE_CERT_BUFFER_2048` macro in `wolfssl_demo.h` + launch server with the following option. ``` @@ -214,7 +214,7 @@ $./examples/server/server -b -d -i -v 3 + For ECDSA sign and verify use, Enable the `USE_CERT_BUFFER_256` macro in `wolfssl_demo.h` -Disble the `USE_CERT_BUFFER_2048` macro in `wolfssl_demo.h` +Disable the `USE_CERT_BUFFER_2048` macro in `wolfssl_demo.h` + launch server with the following option. 
``` @@ -281,7 +281,7 @@ static const byte ucIPAddress[4] = { 192, 168, 11, 241 }; + For ECDSA sign and verify use, Enable the `USE_CERT_BUFFER_256` macro in `wolfssl_demo.h` -Disble the `USE_CERT_BUFFER_2048` macro in `wolfssl_demo.h` +Disable the `USE_CERT_BUFFER_2048` macro in `wolfssl_demo.h` + launch server from e2studio @@ -311,7 +311,7 @@ Cleaning up socket and wolfSSL objects. Waiting connection.... ``` -You will see the follwoing message on Linux terminal. +You will see the following message on Linux terminal. ``` $ ./examples/client/client -h 192.168.11.241 -p 11111 -v 4 SSL version is TLSv1.3 @@ -333,7 +333,7 @@ Received: hello wolfssl! Cleaning up socket and wolfSSL objects. Waiting connection.... ``` -You will see the follwoing message on Linux terminal. +You will see the following message on Linux terminal. ``` $ ./examples/client/client -h 192.168.11.241 -p 11111 -v 4 -A ./certs/ca-ecc-cert.pem -c ./certs/client-ecc-cert.pem -k ./cert s/ecc-client-key.pem @@ -359,7 +359,7 @@ static const byte ucIPAddress[4] = { 192, 168, 11, 241 }; + For ECDSA sign and verify use, Enable the `USE_CERT_BUFFER_256` macro in `wolfssl_demo.h` -Disble the `USE_CERT_BUFFER_2048` macro in `wolfssl_demo.h` +Disable the `USE_CERT_BUFFER_2048` macro in `wolfssl_demo.h` + launch server from e2studio @@ -389,7 +389,7 @@ Cleaning up socket and wolfSSL objects. Waiting connection.... ``` -You will see the follwoing message on Linux terminal. +You will see the following message on Linux terminal. ``` $ ./examples/client/client -h 192.168.11.241 -p 11111 -v 3 SSL version is TLSv1.2 @@ -411,7 +411,7 @@ Received: hello wolfssl! Cleaning up socket and wolfSSL objects. Waiting connection.... ``` -You will see the follwoing message on Linux terminal. +You will see the following message on Linux terminal. 
``` $ ./examples/client/client -h 192.168.11.241 -p 11111 -v 3 -A ./certs/ca-ecc-cert.pem -c ./certs/client-ecc-cert.pem -k ./certs/ecc-client-key.pem SSL version is TLSv1.2 diff --git a/IDE/Renesas/e2studio/RZN2L/test/src/rzn2l_tst_thread_entry.c b/IDE/Renesas/e2studio/RZN2L/test/src/rzn2l_tst_thread_entry.c index 14152aa825..0e4c459ed0 100644 --- a/IDE/Renesas/e2studio/RZN2L/test/src/rzn2l_tst_thread_entry.c +++ b/IDE/Renesas/e2studio/RZN2L/test/src/rzn2l_tst_thread_entry.c @@ -131,7 +131,7 @@ void RSIP_KeyGeneration(FSPSM_ST *g) } /* only pointer sets to NULL */ -/* onwer of keys should be freed */ +/* owner of keys should be freed */ void Clr_CallbackCtx(FSPSM_ST *g) { (void) g; diff --git a/README b/README index f645c3e8d4..ff86035e43 100644 --- a/README +++ b/README @@ -95,7 +95,7 @@ NOTE: * --enable-heapmath is being deprecated and will be removed by 2024 * Added LMS/HSS and XMSS/XMSS^MT wolfcrypt hooks, both normal and verify-only options. * Added support for the AES EAX mode of operation * Port for use with Hitch (https://github.com/varnish/hitch) added -* Add XTS API's to handle multiple sectors in new port ot VeraCrypt +* Add XTS API's to handle multiple sectors in new port to VeraCrypt ## Enhancements and Optimizations diff --git a/README.md b/README.md index 0472d6bd9a..c16642b9cc 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ NOTE: * --enable-heapmath is being deprecated and will be removed by 2024 * Added LMS/HSS and XMSS/XMSS^MT wolfcrypt hooks, both normal and verify-only options. 
* Added support for the AES EAX mode of operation * Port for use with Hitch (https://github.com/varnish/hitch) added -* Add XTS API's to handle multiple sectors in new port ot VeraCrypt +* Add XTS API's to handle multiple sectors in new port to VeraCrypt ## Enhancements and Optimizations diff --git a/cmake/functions.cmake b/cmake/functions.cmake index 47ab832653..329a386eda 100644 --- a/cmake/functions.cmake +++ b/cmake/functions.cmake @@ -53,7 +53,7 @@ function(generate_build_flags) if(WOLFSSL_SCTP OR WOLFSSL_USER_SETTINGS) set(BUILD_SCTP "yes" PARENT_SCOPE) endif() - if(WOLFSSL_DTLS_CID OR WOLFSSL_USER_SETTINGS) + if(WOLFSSL_DTLS_CID OR WOLFSSL_USER_SETTINGS OR WOLFSSL_DTLS) set(BUILD_DTLS_COMMON "yes" PARENT_SCOPE) endif() set(BUILD_MCAST ${WOLFSSL_MCAST} PARENT_SCOPE) diff --git a/doc/dox_comments/header_files/aes.h b/doc/dox_comments/header_files/aes.h index bbdee91e56..5f1610f76e 100644 --- a/doc/dox_comments/header_files/aes.h +++ b/doc/dox_comments/header_files/aes.h @@ -1533,7 +1533,7 @@ WOLFSSL_API int wc_AesEaxEncryptFinal(AesEax* eax, \ref wc_AesEaxInit. When done using the \c AesEax context structure, make sure to free it using \ref wc_AesEaxFree. - \return 0 if data is authenticated succesfully + \return 0 if data is authenticated successfully \return AES_EAX_AUTH_E if the authentication tag does not match the supplied authentication code vector \c authIn \return other error code on failure diff --git a/examples/server/server.h b/examples/server/server.h index dbf492b13d..e0c8ad7bab 100644 --- a/examples/server/server.h +++ b/examples/server/server.h @@ -27,7 +27,7 @@ THREAD_RETURN WOLFSSL_THREAD server_test(void* args); /* Echo bytes using buffer of blockSize until [echoData] bytes are complete. 
*/ -/* If [bechmarkThroughput] set the statistcs will be output at the end */ +/* If [benchmarkThroughput] set the statistics will be output at the end */ int ServerEchoData(WOLFSSL* ssl, int clientfd, int echoData, int blockSize, size_t benchmarkThroughput); diff --git a/src/internal.c b/src/internal.c index 4dae1b960c..cd478b9a03 100644 --- a/src/internal.c +++ b/src/internal.c @@ -20394,7 +20394,7 @@ int ProcessReplyEx(WOLFSSL* ssl, int allowSocketErr) case getRecordLayerHeader: /* DTLSv1.3 record numbers in the header are encrypted, and AAD - * uses the unecrypted form. Because of this we need to modify the + * uses the unencrypted form. Because of this we need to modify the * header, decrypting the numbers inside * DtlsParseUnifiedRecordLayer(). This violates the const attribute * of the buffer parameter of GetRecordHeader() used here. */ diff --git a/src/ssl.c b/src/ssl.c index 2d56dcf85d..49547acf5e 100644 --- a/src/ssl.c +++ b/src/ssl.c @@ -8432,7 +8432,7 @@ static int LoadSystemCaCertsWindows(WOLFSSL_CTX* ctx, byte* loaded) * directly into wolfSSL "the old way". * * As of MacOS 14.0 we are still able to use this method to access system - * certificates. Accessiblity of this API is indicated by the presence of the + * certificates. Accessibility of this API is indicated by the presence of the * Security/SecTrustSettings.h header. In the likely event that Apple removes * access to this API on Macs, this function should be removed and the * DoAppleNativeCertValidation() routine should be used for all devices. @@ -8579,7 +8579,7 @@ int wolfSSL_CTX_load_system_CA_certs(WOLFSSL_CTX* ctx) #if defined(HAVE_SECURITY_SECTRUSTSETTINGS_H) \ && !defined(WOLFSSL_APPLE_NATIVE_CERT_VALIDATION) /* As of MacOS 14.0 we are still able to access system certificates and - * load them manually into wolfSSL "the old way". Accessiblity of this API + * load them manually into wolfSSL "the old way". 
Accessibility of this API * is indicated by the presence of the Security/SecTrustSettings.h header */ ret = LoadSystemCaCertsMac(ctx, &loaded); #elif defined(WOLFSSL_APPLE_NATIVE_CERT_VALIDATION) diff --git a/src/ssl_crypto.c b/src/ssl_crypto.c index b7ff5a19e5..063d1eafc2 100644 --- a/src/ssl_crypto.c +++ b/src/ssl_crypto.c @@ -1616,8 +1616,8 @@ WOLFSSL_HMAC_CTX* wolfSSL_HMAC_CTX_new(void) * * Not an OpenSSL compatibility API. * - * @param [in, out] ctx HMAC contect object. - * @return 1 inficating success. + * @param [in, out] ctx HMAC context object. + * @return 1 indicating success. */ int wolfSSL_HMAC_CTX_Init(WOLFSSL_HMAC_CTX* ctx) { diff --git a/src/tls.c b/src/tls.c index 9a42a3912c..eaa06a18b0 100644 --- a/src/tls.c +++ b/src/tls.c @@ -8396,7 +8396,7 @@ static int TLSX_KeyShare_ProcessPqc(WOLFSSL* ssl, KeyShareEntry* keyShareEntry) ret = kyber_id2type(oqs_group, &type); if (ret != 0) { WOLFSSL_MSG("Invalid OQS algorithm specified."); - ret = BAD_FUNC_ARG; + return BAD_FUNC_ARG; } if (ret == 0) { ret = wc_KyberKey_Init(type, kem, ssl->heap, INVALID_DEVID); @@ -8887,7 +8887,7 @@ static int server_generate_pqc_ciphertext(WOLFSSL* ssl, ret = kyber_id2type(oqs_group, &type); if (ret != 0) { WOLFSSL_MSG("Invalid Kyber algorithm specified."); - ret = BAD_FUNC_ARG; + return BAD_FUNC_ARG; } if (ret == 0) { diff --git a/src/x509.c b/src/x509.c index a7b512bd7f..a1c4fc4dbe 100644 --- a/src/x509.c +++ b/src/x509.c @@ -5218,7 +5218,7 @@ static WOLFSSL_X509* loadX509orX509REQFromBuffer( const unsigned char* buf, int sz, int format, int type) { - int ret; + int ret = 0; WOLFSSL_X509* x509 = NULL; DerBuffer* der = NULL; @@ -5226,7 +5226,8 @@ static WOLFSSL_X509* loadX509orX509REQFromBuffer( if (format == WOLFSSL_FILETYPE_PEM) { #ifdef WOLFSSL_PEM_TO_DER - if (PemToDer(buf, sz, type, &der, NULL, NULL, NULL) != 0) { + ret = PemToDer(buf, sz, type, &der, NULL, NULL, NULL); + if (ret != 0) { FreeDer(&der); } #else @@ -5252,20 +5253,28 @@ static WOLFSSL_X509* 
loadX509orX509REQFromBuffer( #ifdef WOLFSSL_SMALL_STACK cert = (DecodedCert*)XMALLOC(sizeof(DecodedCert), NULL, DYNAMIC_TYPE_DCERT); - if (cert != NULL) + if (cert == NULL) { + ret = MEMORY_ERROR; + } + else #endif { InitDecodedCert(cert, der->buffer, der->length, NULL); - if (ParseCertRelative(cert, type, 0, NULL) == 0) { + ret = ParseCertRelative(cert, type, 0, NULL); + if (ret == 0) { x509 = (WOLFSSL_X509*)XMALLOC(sizeof(WOLFSSL_X509), NULL, DYNAMIC_TYPE_X509); if (x509 != NULL) { InitX509(x509, 1, NULL); - if (CopyDecodedToX509(x509, cert) != 0) { + ret = CopyDecodedToX509(x509, cert); + if (ret != 0) { wolfSSL_X509_free(x509); x509 = NULL; } } + else { + ret = MEMORY_ERROR; + } } FreeDecodedCert(cert); @@ -5277,6 +5286,10 @@ static WOLFSSL_X509* loadX509orX509REQFromBuffer( FreeDer(&der); } + if (ret != 0) { + WOLFSSL_ERROR(ret); + } + return x509; } diff --git a/tests/api.c b/tests/api.c index 2b35b13bf9..5c384882fb 100644 --- a/tests/api.c +++ b/tests/api.c @@ -35690,7 +35690,7 @@ static int test_X509_STORE_untrusted(void) NULL }; - /* Only immediate issuer in untrusted chaing. Fails since can't build chain + /* Only immediate issuer in untrusted chain. Fails since can't build chain * to loaded CA. */ ExpectIntEQ(test_X509_STORE_untrusted_certs(untrusted1, 0, X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY, 1), TEST_SUCCESS); diff --git a/wolfcrypt/src/ext_lms.c b/wolfcrypt/src/ext_lms.c index ebd07e26c5..a5155076b2 100644 --- a/wolfcrypt/src/ext_lms.c +++ b/wolfcrypt/src/ext_lms.c @@ -231,7 +231,7 @@ const char * wc_LmsKey_RcToStr(enum wc_LmsRc lmsEc) /* Init an LMS key. * - * Call this before setting the parms of an LMS key. + * Call this before setting the params of an LMS key. * * Returns 0 on success. * */ @@ -404,7 +404,7 @@ int wc_LmsKey_SetParameters(LmsKey * key, int levels, int height, key->lm_ots_type[i] = ots; } - /* Move the state to parms set. + /* Move the state to params set. * Key is ready for MakeKey or Reload. 
*/ key->state = WC_LMS_STATE_PARMSET; @@ -656,7 +656,7 @@ int wc_LmsKey_MakeKey(LmsKey* key, WC_RNG * rng) return 0; } -/* Reload a key that has been prepared with the appropriate parms and +/* Reload a key that has been prepared with the appropriate params and * data. Use this if you wish to resume signing with an existing key. * * Write/read callbacks, and context data, must be set prior. diff --git a/wolfcrypt/src/ext_xmss.c b/wolfcrypt/src/ext_xmss.c index c19e95e916..b1e5e46dd4 100644 --- a/wolfcrypt/src/ext_xmss.c +++ b/wolfcrypt/src/ext_xmss.c @@ -97,7 +97,7 @@ static int sha256_cb(const unsigned char *in, unsigned long long inlen, /* Init an XMSS key. * - * Call this before setting the parms of an XMSS key. + * Call this before setting the params of an XMSS key. * * key [in] The XMSS key to init. * heap [in] Unused. @@ -201,7 +201,7 @@ static int wc_XmssKey_SetOid(XmssKey * key, uint32_t oid, int is_xmssmt) /* Set the XMSS key parameter string. * - * The input string must be one of the supported parm set names in + * The input string must be one of the supported param set names in * the "Name" section from the table in wolfssl/wolfcrypt/xmss.h, * e.g. "XMSS-SHA2_10_256" or "XMSSMT-SHA2_20/4_256". 
* diff --git a/wolfcrypt/src/port/Espressif/esp32_mp.c b/wolfcrypt/src/port/Espressif/esp32_mp.c index 066c61f685..1b699dfddf 100644 --- a/wolfcrypt/src/port/Espressif/esp32_mp.c +++ b/wolfcrypt/src/port/Espressif/esp32_mp.c @@ -1015,8 +1015,8 @@ int esp_mp_montgomery_init(MATH_INT_T* X, MATH_INT_T* Y, MATH_INT_T* M, return MP_HW_FALLBACK; } if ((X == NULL) || (Y == NULL) || (M == NULL) ) { - /* if a bad oprand passed, we cannot use HW */ - ESP_LOGE(TAG, "ERROR: Bad Montgomery operand, falling back to SW"); + /* if a bad operand passed, we cannot use HW */ + ESP_LOGE(TAG, "ERROR: Bad Montgomery operand, falling back to SW"); return MP_HW_FALLBACK; } XMEMSET(mph, 0, sizeof(struct esp_mp_helper)); diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 989e65bd25..2efc3d35d8 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -1882,7 +1882,7 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, byte counter[AES_BLOCK_SIZE]; byte scratch[AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. - * Some cases was stack pointer plus offset others was a regester containing + * Some cases was stack pointer plus offset others was a register containing * address. To make uniform for passing in to inline assembly code am using * pointers to the head of each local array. */ @@ -3528,7 +3528,7 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, byte counter[AES_BLOCK_SIZE]; byte scratch[AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. - * Some cases was stack pointer plus offset others was a regester containing + * Some cases was stack pointer plus offset others was a register containing * address. To make uniform for passing in to inline assembly code am using * pointers to the head of each local array. 
*/ @@ -5291,7 +5291,7 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, byte counter[AES_BLOCK_SIZE]; byte scratch[AES_BLOCK_SIZE]; /* Noticed different optimization levels treated head of array different. - * Some cases was stack pointer plus offset others was a regester containing + * Some cases was stack pointer plus offset others was a register containing * address. To make uniform for passing in to inline assembly code am using * pointers to the head of each local array. */ diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm.S b/wolfcrypt/src/port/arm/thumb2-aes-asm.S index f483f87de4..0badf8f97f 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm.S @@ -670,13 +670,13 @@ L_AES_invert_key_mix_loop: EOR r8, r8, r9, ROR #24 STR r8, [r0], #4 SUBS r11, r11, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_invert_key_mix_loop #else - BNE.N L_AES_invert_key_mix_loop + BNE.W L_AES_invert_key_mix_loop #endif POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 165 + /* Cycle Count = 165 */ .size AES_invert_key,.-AES_invert_key #endif /* HAVE_AES_DECRYPT */ .text @@ -699,20 +699,20 @@ L_AES_Thumb2_rcon: .globl AES_set_encrypt_key .type AES_set_encrypt_key, %function AES_set_encrypt_key: - PUSH {r4, r5, r6, r7, r8, lr} - LDR r8, L_AES_Thumb2_te + PUSH {r4, r5, r6, r7, r8, r9, r10, lr} + LDR r10, L_AES_Thumb2_te ADR lr, L_AES_Thumb2_rcon CMP r1, #0x80 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_set_encrypt_key_start_128 #else - BEQ.N L_AES_set_encrypt_key_start_128 + BEQ.W L_AES_set_encrypt_key_start_128 #endif CMP r1, #0xc0 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_set_encrypt_key_start_192 #else - BEQ.N L_AES_set_encrypt_key_start_192 + BEQ.W L_AES_set_encrypt_key_start_192 #endif LDRD r4, r5, [r0] LDRD r6, 
r7, [r0, #8] @@ -735,10 +735,10 @@ L_AES_set_encrypt_key_loop_256: UBFX r5, r7, #8, #8 UBFX r6, r7, #16, #8 LSR r7, r7, #24 - LDRB r4, [r8, r4, LSL #2] - LDRB r5, [r8, r5, LSL #2] - LDRB r6, [r8, r6, LSL #2] - LDRB r7, [r8, r7, LSL #2] + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] EOR r3, r7, r4, LSL #8 EOR r3, r3, r5, LSL #16 EOR r3, r3, r6, LSL #24 @@ -757,10 +757,10 @@ L_AES_set_encrypt_key_loop_256: UBFX r5, r3, #16, #8 LSR r6, r3, #24 UBFX r3, r3, #0, #8 - LDRB r4, [r8, r4, LSL #2] - LDRB r6, [r8, r6, LSL #2] - LDRB r5, [r8, r5, LSL #2] - LDRB r3, [r8, r3, LSL #2] + LDRB r4, [r10, r4, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r3, [r10, r3, LSL #2] EOR r3, r3, r4, LSL #8 EOR r3, r3, r5, LSL #16 EOR r3, r3, r6, LSL #24 @@ -782,10 +782,10 @@ L_AES_set_encrypt_key_loop_256: UBFX r5, r7, #8, #8 UBFX r6, r7, #16, #8 LSR r7, r7, #24 - LDRB r4, [r8, r4, LSL #2] - LDRB r5, [r8, r5, LSL #2] - LDRB r6, [r8, r6, LSL #2] - LDRB r7, [r8, r7, LSL #2] + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] EOR r3, r7, r4, LSL #8 EOR r3, r3, r5, LSL #16 EOR r3, r3, r6, LSL #24 @@ -799,69 +799,77 @@ L_AES_set_encrypt_key_loop_256: ADD r2, r2, #0x10 STM r2, {r4, r5, r6, r7} SUB r2, r2, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_set_encrypt_key_end +#else + B.N L_AES_set_encrypt_key_end +#endif L_AES_set_encrypt_key_start_192: LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] - LDRD r0, r1, [r0, #16] + LDRD r8, r9, [r0, #16] REV r4, r4 REV r5, r5 REV r6, r6 REV r7, r7 - REV r0, r0 - REV r1, r1 + REV r8, r8 + REV r9, r9 STM r2, {r4, r5, r6, r7} - STRD r0, r1, [r2, #16] - MOV r7, r1 + STRD r8, r9, [r2, #16] + MOV r7, r9 MOV r12, #0x7 L_AES_set_encrypt_key_loop_192: - UBFX r0, r7, #0, #8 - UBFX r1, r7, #8, #8 - UBFX r4, r7, #16, #8 - LSR r7, r7, #24 - LDRB r0, [r8, r0, LSL #2] - LDRB 
r1, [r8, r1, LSL #2] - LDRB r4, [r8, r4, LSL #2] - LDRB r7, [r8, r7, LSL #2] - EOR r3, r7, r0, LSL #8 - EOR r3, r3, r1, LSL #16 - EOR r3, r3, r4, LSL #24 - LDM r2!, {r0, r1, r4, r5, r6, r7} - EOR r0, r0, r3 + UBFX r4, r9, #0, #8 + UBFX r5, r9, #8, #8 + UBFX r6, r9, #16, #8 + LSR r9, r9, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r9, [r10, r9, LSL #2] + EOR r3, r9, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7, r8, r9} + EOR r4, r4, r3 LDM lr!, {r3} - EOR r0, r0, r3 - EOR r1, r1, r0 - EOR r4, r4, r1 + EOR r4, r4, r3 EOR r5, r5, r4 EOR r6, r6, r5 EOR r7, r7, r6 - STM r2, {r0, r1, r4, r5, r6, r7} + EOR r8, r8, r7 + EOR r9, r9, r8 + STM r2, {r4, r5, r6, r7, r8, r9} SUBS r12, r12, #0x1 #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_set_encrypt_key_loop_192 #else BNE.N L_AES_set_encrypt_key_loop_192 #endif - UBFX r0, r7, #0, #8 - UBFX r1, r7, #8, #8 - UBFX r4, r7, #16, #8 - LSR r7, r7, #24 - LDRB r0, [r8, r0, LSL #2] - LDRB r1, [r8, r1, LSL #2] - LDRB r4, [r8, r4, LSL #2] - LDRB r7, [r8, r7, LSL #2] - EOR r3, r7, r0, LSL #8 - EOR r3, r3, r1, LSL #16 - EOR r3, r3, r4, LSL #24 - LDM r2!, {r0, r1, r4, r5, r6, r7} - EOR r0, r0, r3 + UBFX r4, r9, #0, #8 + UBFX r5, r9, #8, #8 + UBFX r6, r9, #16, #8 + LSR r9, r9, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r9, [r10, r9, LSL #2] + EOR r3, r9, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7, r8, r9} + EOR r4, r4, r3 LDM lr!, {r3} - EOR r0, r0, r3 - EOR r1, r1, r0 - EOR r4, r4, r1 + EOR r4, r4, r3 EOR r5, r5, r4 - STM r2, {r0, r1, r4, r5} + EOR r6, r6, r5 + EOR r7, r7, r6 + STM r2, {r4, r5, r6, r7} +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_set_encrypt_key_end +#else + B.N L_AES_set_encrypt_key_end +#endif L_AES_set_encrypt_key_start_128: LDRD r4, r5, [r0] 
LDRD r6, r7, [r0, #8] @@ -876,10 +884,10 @@ L_AES_set_encrypt_key_loop_128: UBFX r5, r7, #8, #8 UBFX r6, r7, #16, #8 LSR r7, r7, #24 - LDRB r4, [r8, r4, LSL #2] - LDRB r5, [r8, r5, LSL #2] - LDRB r6, [r8, r6, LSL #2] - LDRB r7, [r8, r7, LSL #2] + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] EOR r3, r7, r4, LSL #8 EOR r3, r3, r5, LSL #16 EOR r3, r3, r6, LSL #24 @@ -898,8 +906,8 @@ L_AES_set_encrypt_key_loop_128: BNE.N L_AES_set_encrypt_key_loop_128 #endif L_AES_set_encrypt_key_end: - POP {r4, r5, r6, r7, r8, pc} - # Cycle Count = 327 + POP {r4, r5, r6, r7, r8, r9, r10, pc} + /* Cycle Count = 331 */ .size AES_set_encrypt_key,.-AES_set_encrypt_key .text .align 4 @@ -953,7 +961,7 @@ L_AES_encrypt_block_nr: LDM r3!, {r4, r5, r6, r7} EOR r11, r11, lr, ROR #24 EOR r11, r11, r2, ROR #8 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r8, r8, r4 EOR r9, r9, r5 EOR r10, r10, r6 @@ -1003,16 +1011,16 @@ L_AES_encrypt_block_nr: LDM r3!, {r8, r9, r10, r11} EOR r7, r7, lr, ROR #24 EOR r7, r7, r2, ROR #8 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 EOR r7, r7, r11 SUBS r1, r1, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_encrypt_block_nr #else - BNE.N L_AES_encrypt_block_nr + BNE.W L_AES_encrypt_block_nr #endif UBFX r8, r5, #16, #8 LSR r11, r4, #24 @@ -1059,7 +1067,7 @@ L_AES_encrypt_block_nr: LDM r3!, {r4, r5, r6, r7} EOR r11, r11, lr, ROR #24 EOR r11, r11, r2, ROR #8 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r8, r8, r4 EOR r9, r9, r5 EOR r10, r10, r6 @@ -1109,13 +1117,13 @@ L_AES_encrypt_block_nr: LDM r3, {r8, r9, r10, r11} EOR r7, r7, lr, LSL #8 EOR r7, r7, r2, LSL #16 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 EOR r7, r7, r11 POP {pc} - # Cycle Count = 285 + /* Cycle Count = 285 */ .size 
AES_encrypt_block,.-AES_encrypt_block #if defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) .text @@ -1137,16 +1145,16 @@ AES_ECB_encrypt: LDR r12, [sp, #36] PUSH {r3} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_encrypt_start_block_128 #else - BEQ.N L_AES_ECB_encrypt_start_block_128 + BEQ.W L_AES_ECB_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_encrypt_start_block_192 #else - BEQ.N L_AES_ECB_encrypt_start_block_192 + BEQ.W L_AES_ECB_encrypt_start_block_192 #endif L_AES_ECB_encrypt_loop_block_256: LDR r4, [lr] @@ -1159,7 +1167,7 @@ L_AES_ECB_encrypt_loop_block_256: REV r7, r7 PUSH {r1, r2, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1179,12 +1187,16 @@ L_AES_ECB_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_256 #else - BNE.N L_AES_ECB_encrypt_loop_block_256 + BNE.W L_AES_ECB_encrypt_loop_block_256 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_encrypt_end +#else + B.N L_AES_ECB_encrypt_end +#endif L_AES_ECB_encrypt_start_block_192: L_AES_ECB_encrypt_loop_block_192: LDR r4, [lr] @@ -1197,7 +1209,7 @@ L_AES_ECB_encrypt_loop_block_192: REV r7, r7 PUSH {r1, r2, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1217,12 +1229,16 @@ L_AES_ECB_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_192 #else - BNE.N L_AES_ECB_encrypt_loop_block_192 + BNE.W L_AES_ECB_encrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_encrypt_end +#else + B.N L_AES_ECB_encrypt_end +#endif L_AES_ECB_encrypt_start_block_128: L_AES_ECB_encrypt_loop_block_128: LDR r4, [lr] @@ -1235,7 +1251,7 @@ L_AES_ECB_encrypt_loop_block_128: REV r7, r7 PUSH {r1, r2, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1255,15 +1271,15 @@ L_AES_ECB_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_128 #else - BNE.N L_AES_ECB_encrypt_loop_block_128 + BNE.W L_AES_ECB_encrypt_loop_block_128 #endif L_AES_ECB_encrypt_end: POP {r3} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 212 + /* Cycle Count = 212 */ .size AES_ECB_encrypt,.-AES_ECB_encrypt #endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_CBC @@ -1280,16 +1296,16 @@ AES_CBC_encrypt: LDM r9, {r4, r5, r6, r7} PUSH {r3, r9} CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_encrypt_start_block_128 #else - BEQ.N L_AES_CBC_encrypt_start_block_128 + BEQ.W L_AES_CBC_encrypt_start_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_encrypt_start_block_192 #else - BEQ.N L_AES_CBC_encrypt_start_block_192 + BEQ.W L_AES_CBC_encrypt_start_block_192 #endif L_AES_CBC_encrypt_loop_block_256: LDR r8, [lr] @@ -1306,7 +1322,7 @@ L_AES_CBC_encrypt_loop_block_256: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - 
XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1326,12 +1342,16 @@ L_AES_CBC_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_256 #else - BNE.N L_AES_CBC_encrypt_loop_block_256 + BNE.W L_AES_CBC_encrypt_loop_block_256 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_encrypt_end +#else + B.N L_AES_CBC_encrypt_end +#endif L_AES_CBC_encrypt_start_block_192: L_AES_CBC_encrypt_loop_block_192: LDR r8, [lr] @@ -1348,7 +1368,7 @@ L_AES_CBC_encrypt_loop_block_192: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1368,12 +1388,16 @@ L_AES_CBC_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_192 #else - BNE.N L_AES_CBC_encrypt_loop_block_192 + BNE.W L_AES_CBC_encrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_encrypt_end +#else + B.N L_AES_CBC_encrypt_end +#endif L_AES_CBC_encrypt_start_block_128: L_AES_CBC_encrypt_loop_block_128: LDR r8, [lr] @@ -1390,7 +1414,7 @@ L_AES_CBC_encrypt_loop_block_128: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1410,16 +1434,16 @@ L_AES_CBC_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_128 #else - BNE.N L_AES_CBC_encrypt_loop_block_128 + BNE.W L_AES_CBC_encrypt_loop_block_128 #endif L_AES_CBC_encrypt_end: POP {r3, r9} 
STM r9, {r4, r5, r6, r7} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 238 + /* Cycle Count = 238 */ .size AES_CBC_encrypt,.-AES_CBC_encrypt #endif /* HAVE_AES_CBC */ #ifdef WOLFSSL_AES_COUNTER @@ -1441,16 +1465,16 @@ AES_CTR_encrypt: STM r8, {r4, r5, r6, r7} PUSH {r3, r8} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CTR_encrypt_start_block_128 #else - BEQ.N L_AES_CTR_encrypt_start_block_128 + BEQ.W L_AES_CTR_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CTR_encrypt_start_block_192 #else - BEQ.N L_AES_CTR_encrypt_start_block_192 + BEQ.W L_AES_CTR_encrypt_start_block_192 #endif L_AES_CTR_encrypt_loop_block_256: PUSH {r1, r2, lr} @@ -1461,7 +1485,7 @@ L_AES_CTR_encrypt_loop_block_256: ADC r8, r4, #0x0 STM lr, {r8, r9, r10, r11} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1491,12 +1515,16 @@ L_AES_CTR_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_256 #else - BNE.N L_AES_CTR_encrypt_loop_block_256 + BNE.W L_AES_CTR_encrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_CTR_encrypt_end +#else + B.W L_AES_CTR_encrypt_end +#endif L_AES_CTR_encrypt_start_block_192: L_AES_CTR_encrypt_loop_block_192: PUSH {r1, r2, lr} @@ -1507,7 +1535,7 @@ L_AES_CTR_encrypt_loop_block_192: ADC r8, r4, #0x0 STM lr, {r8, r9, r10, r11} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1537,12 +1565,16 @@ L_AES_CTR_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_192 #else - BNE.N L_AES_CTR_encrypt_loop_block_192 + BNE.W L_AES_CTR_encrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_CTR_encrypt_end +#else + B.W L_AES_CTR_encrypt_end +#endif L_AES_CTR_encrypt_start_block_128: L_AES_CTR_encrypt_loop_block_128: PUSH {r1, r2, lr} @@ -1553,7 +1585,7 @@ L_AES_CTR_encrypt_loop_block_128: ADC r8, r4, #0x0 STM lr, {r8, r9, r10, r11} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1583,10 +1615,10 @@ L_AES_CTR_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_128 #else - BNE.N L_AES_CTR_encrypt_loop_block_128 + BNE.W L_AES_CTR_encrypt_loop_block_128 #endif L_AES_CTR_encrypt_end: POP {r3, r8} @@ -1596,7 +1628,7 @@ L_AES_CTR_encrypt_end: REV r7, r7 STM r8, {r4, r5, r6, r7} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 293 + /* Cycle Count = 293 */ .size AES_CTR_encrypt,.-AES_CTR_encrypt #endif /* WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_DECRYPT @@ -1653,7 +1685,7 @@ L_AES_decrypt_block_nr: LDM r3!, {r4, r5, r6, r7} EOR r11, r11, lr, ROR #8 EOR r11, r11, r12, ROR #24 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r8, r8, r4 EOR r9, r9, r5 EOR r10, r10, r6 @@ -1703,16 +1735,16 @@ L_AES_decrypt_block_nr: LDM r3!, {r8, r9, r10, r11} EOR r7, r7, lr, ROR #8 EOR r7, r7, r12, ROR #24 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 EOR r7, r7, r11 SUBS r1, r1, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_decrypt_block_nr #else - BNE.N L_AES_decrypt_block_nr + BNE.W L_AES_decrypt_block_nr #endif UBFX r8, r7, #16, #8 LSR r11, r4, #24 @@ -1759,7 +1791,7 @@ 
L_AES_decrypt_block_nr: LDM r3!, {r4, r5, r6, r7} EOR r11, r11, lr, ROR #8 EOR r11, r11, r12, ROR #24 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r8, r8, r4 EOR r9, r9, r5 EOR r10, r10, r6 @@ -1809,13 +1841,13 @@ L_AES_decrypt_block_nr: LDM r3, {r8, r9, r10, r11} EOR r7, r7, r12, LSL #8 EOR r7, r7, lr, LSL #16 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 EOR r7, r7, r11 POP {pc} - # Cycle Count = 285 + /* Cycle Count = 285 */ .size AES_decrypt_block,.-AES_decrypt_block .text .type L_AES_Thumb2_td_ecb, %object @@ -2097,16 +2129,16 @@ AES_ECB_decrypt: MOV r12, r2 ADR r2, L_AES_Thumb2_td4 CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_decrypt_start_block_128 #else - BEQ.N L_AES_ECB_decrypt_start_block_128 + BEQ.W L_AES_ECB_decrypt_start_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_decrypt_start_block_192 #else - BEQ.N L_AES_ECB_decrypt_start_block_192 + BEQ.W L_AES_ECB_decrypt_start_block_192 #endif L_AES_ECB_decrypt_loop_block_256: LDR r4, [lr] @@ -2119,7 +2151,7 @@ L_AES_ECB_decrypt_loop_block_256: REV r7, r7 PUSH {r1, r3, r12, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2138,12 +2170,16 @@ L_AES_ECB_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_256 #else - BNE.N L_AES_ECB_decrypt_loop_block_256 + BNE.W L_AES_ECB_decrypt_loop_block_256 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_decrypt_end +#else + B.N L_AES_ECB_decrypt_end +#endif L_AES_ECB_decrypt_start_block_192: L_AES_ECB_decrypt_loop_block_192: LDR r4, [lr] 
@@ -2156,7 +2192,7 @@ L_AES_ECB_decrypt_loop_block_192: REV r7, r7 PUSH {r1, r3, r12, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2175,12 +2211,16 @@ L_AES_ECB_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_192 #else - BNE.N L_AES_ECB_decrypt_loop_block_192 + BNE.W L_AES_ECB_decrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_decrypt_end +#else + B.N L_AES_ECB_decrypt_end +#endif L_AES_ECB_decrypt_start_block_128: L_AES_ECB_decrypt_loop_block_128: LDR r4, [lr] @@ -2193,7 +2233,7 @@ L_AES_ECB_decrypt_loop_block_128: REV r7, r7 PUSH {r1, r3, r12, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2212,14 +2252,14 @@ L_AES_ECB_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_128 #else - BNE.N L_AES_ECB_decrypt_loop_block_128 + BNE.W L_AES_ECB_decrypt_loop_block_128 #endif L_AES_ECB_decrypt_end: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 210 + /* Cycle Count = 210 */ .size AES_ECB_decrypt,.-AES_ECB_decrypt #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_CBC @@ -2237,16 +2277,16 @@ AES_CBC_decrypt: ADR r2, L_AES_Thumb2_td4 PUSH {r3, r4} CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_loop_block_128 #else - BEQ.N L_AES_CBC_decrypt_loop_block_128 + BEQ.W L_AES_CBC_decrypt_loop_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_loop_block_192 #else - BEQ.N L_AES_CBC_decrypt_loop_block_192 + BEQ.W L_AES_CBC_decrypt_loop_block_192 #endif L_AES_CBC_decrypt_loop_block_256: PUSH {r1, r12, lr} @@ -2262,7 +2302,7 @@ L_AES_CBC_decrypt_loop_block_256: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2288,10 +2328,10 @@ L_AES_CBC_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2306,7 +2346,7 @@ L_AES_CBC_decrypt_loop_block_256: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2333,12 +2373,16 @@ L_AES_CBC_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_256 #else - BNE.N L_AES_CBC_decrypt_loop_block_256 + BNE.W L_AES_CBC_decrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_CBC_decrypt_end +#else + B.W L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_loop_block_192: PUSH {r1, r12, lr} LDR r4, [lr] @@ -2353,7 +2397,7 @@ L_AES_CBC_decrypt_loop_block_192: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2379,10 +2423,10 @@ L_AES_CBC_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W 
L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2397,7 +2441,7 @@ L_AES_CBC_decrypt_loop_block_192: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2424,12 +2468,16 @@ L_AES_CBC_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_192 #else - BNE.N L_AES_CBC_decrypt_loop_block_192 + BNE.W L_AES_CBC_decrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_CBC_decrypt_end +#else + B.W L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_loop_block_128: PUSH {r1, r12, lr} LDR r4, [lr] @@ -2444,7 +2492,7 @@ L_AES_CBC_decrypt_loop_block_128: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2470,10 +2518,10 @@ L_AES_CBC_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2488,7 +2536,7 @@ L_AES_CBC_decrypt_loop_block_128: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2515,12 +2563,16 @@ L_AES_CBC_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_128 #else - BNE.N L_AES_CBC_decrypt_loop_block_128 + BNE.W L_AES_CBC_decrypt_loop_block_128 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_decrypt_end +#else + B.N 
L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_end_odd: LDR r4, [sp, #4] LDRD r8, r9, [r4, #16] @@ -2530,7 +2582,7 @@ L_AES_CBC_decrypt_end_odd: L_AES_CBC_decrypt_end: POP {r3, r4} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 518 + /* Cycle Count = 518 */ .size AES_CBC_decrypt,.-AES_CBC_decrypt #endif /* HAVE_AES_CBC */ #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_CBC */ @@ -3109,13 +3161,13 @@ L_GCM_gmult_len_start_block: POP {r3} SUBS r3, r3, #0x10 ADD r2, r2, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_GCM_gmult_len_start_block #else - BNE.N L_GCM_gmult_len_start_block + BNE.W L_GCM_gmult_len_start_block #endif POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 742 + /* Cycle Count = 742 */ .size GCM_gmult_len,.-GCM_gmult_len .text .type L_AES_Thumb2_te_gcm, %object @@ -3141,16 +3193,16 @@ AES_GCM_encrypt: STM r8, {r4, r5, r6, r7} PUSH {r3, r8} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_GCM_encrypt_start_block_128 #else - BEQ.N L_AES_GCM_encrypt_start_block_128 + BEQ.W L_AES_GCM_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_GCM_encrypt_start_block_192 #else - BEQ.N L_AES_GCM_encrypt_start_block_192 + BEQ.W L_AES_GCM_encrypt_start_block_192 #endif L_AES_GCM_encrypt_loop_block_256: PUSH {r1, r2, lr} @@ -3158,7 +3210,7 @@ L_AES_GCM_encrypt_loop_block_256: ADD r7, r7, #0x1 LDM r3!, {r8, r9, r10, r11} STR r7, [lr, #12] - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -3188,12 +3240,16 @@ L_AES_GCM_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_256 #else 
- BNE.N L_AES_GCM_encrypt_loop_block_256 + BNE.W L_AES_GCM_encrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_GCM_encrypt_end +#else + B.W L_AES_GCM_encrypt_end +#endif L_AES_GCM_encrypt_start_block_192: L_AES_GCM_encrypt_loop_block_192: PUSH {r1, r2, lr} @@ -3201,7 +3257,7 @@ L_AES_GCM_encrypt_loop_block_192: ADD r7, r7, #0x1 LDM r3!, {r8, r9, r10, r11} STR r7, [lr, #12] - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -3231,12 +3287,16 @@ L_AES_GCM_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_192 #else - BNE.N L_AES_GCM_encrypt_loop_block_192 + BNE.W L_AES_GCM_encrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_GCM_encrypt_end +#else + B.W L_AES_GCM_encrypt_end +#endif L_AES_GCM_encrypt_start_block_128: L_AES_GCM_encrypt_loop_block_128: PUSH {r1, r2, lr} @@ -3244,7 +3304,7 @@ L_AES_GCM_encrypt_loop_block_128: ADD r7, r7, #0x1 LDM r3!, {r8, r9, r10, r11} STR r7, [lr, #12] - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -3274,10 +3334,10 @@ L_AES_GCM_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_128 #else - BNE.N L_AES_GCM_encrypt_loop_block_128 + BNE.W L_AES_GCM_encrypt_loop_block_128 #endif L_AES_GCM_encrypt_end: POP {r3, r8} @@ -3287,7 +3347,7 @@ L_AES_GCM_encrypt_end: REV r7, r7 STM r8, {r4, r5, r6, r7} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 275 + /* Cycle Count = 275 */ .size AES_GCM_encrypt,.-AES_GCM_encrypt #endif /* HAVE_AESGCM */ #endif /* !NO_AES */ diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c index 
48b5edc16c..7d5357f1a2 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c @@ -39,7 +39,7 @@ #ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_ARMASM -#if !defined(__aarch64__) && defined(__arm__) +#if !defined(__aarch64__) && defined(__thumb__) #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm @@ -208,9 +208,9 @@ void AES_invert_key(unsigned char* ks, word32 rounds) #ifndef WOLFSSL_NO_VAR_ASSIGN_REG register unsigned char* ks __asm__ ("r0") = (unsigned char*)ks_p; register word32 rounds __asm__ ("r1") = (word32)rounds_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_c __asm__ ("r2") = (uint32_t*)L_AES_Thumb2_te; register uint32_t* L_AES_Thumb2_td_c __asm__ ("r3") = (uint32_t*)L_AES_Thumb2_td; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "MOV r12, %[L_AES_Thumb2_te]\n\t" @@ -218,7 +218,7 @@ void AES_invert_key(unsigned char* ks, word32 rounds) "ADD r10, %[ks], %[rounds], LSL #4\n\t" "MOV r11, %[rounds]\n\t" "\n" - "L_AES_invert_key_loop_%=:\n\t" + "L_AES_invert_key_loop:\n\t" "LDM %[ks], {r2, r3, r4, r5}\n\t" "LDM r10, {r6, r7, r8, r9}\n\t" "STM r10, {r2, r3, r4, r5}\n\t" @@ -226,15 +226,15 @@ void AES_invert_key(unsigned char* ks, word32 rounds) "SUBS r11, r11, #0x2\n\t" "SUB r10, r10, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_invert_key_loop_%=\n\t" + "BNE L_AES_invert_key_loop\n\t" #else - "BNE.N L_AES_invert_key_loop_%=\n\t" + "BNE.N L_AES_invert_key_loop\n\t" #endif "SUB %[ks], %[ks], %[rounds], LSL #3\n\t" "ADD %[ks], %[ks], #0x10\n\t" "SUB r11, %[rounds], #0x1\n\t" "\n" - "L_AES_invert_key_mix_loop_%=:\n\t" + "L_AES_invert_key_mix_loop:\n\t" "LDM %[ks], {r2, r3, r4, r5}\n\t" "UBFX r6, r2, #0, #8\n\t" "UBFX r7, r2, #8, #8\n\t" @@ -301,13 +301,19 @@ void AES_invert_key(unsigned char* ks, word32 rounds) "EOR r8, r8, r9, ROR #24\n\t" "STR r8, [%[ks]], #4\n\t" "SUBS r11, r11, #0x1\n\t" -#if defined(__GNUC__) || 
defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_invert_key_mix_loop_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_invert_key_mix_loop\n\t" #else - "BNE.N L_AES_invert_key_mix_loop_%=\n\t" + "BNE.W L_AES_invert_key_mix_loop\n\t" #endif - : [ks] "+r" (ks), [rounds] "+r" (rounds), [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [ks] "+r" (ks), [rounds] "+r" (rounds), + [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c) : +#else + : [ks] "+r" (ks), [rounds] "+r" (rounds) + : [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_td] "r" (L_AES_Thumb2_td) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -316,7 +322,7 @@ void AES_invert_key(unsigned char* ks, word32 rounds) static const uint32_t L_AES_Thumb2_rcon[] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1b000000, 0x36000000, + 0x1b000000, 0x36000000 }; void AES_set_encrypt_key(const unsigned char* key, word32 len, @@ -331,24 +337,24 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks register const unsigned char* key __asm__ ("r0") = (const unsigned char*)key_p; register word32 len __asm__ ("r1") = (word32)len_p; register unsigned char* ks __asm__ ("r2") = (unsigned char*)ks_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_c __asm__ ("r3") = (uint32_t*)L_AES_Thumb2_te; register uint32_t* L_AES_Thumb2_rcon_c __asm__ ("r4") = (uint32_t*)&L_AES_Thumb2_rcon; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( - "MOV r8, %[L_AES_Thumb2_te]\n\t" + "MOV r10, %[L_AES_Thumb2_te]\n\t" "MOV lr, %[L_AES_Thumb2_rcon]\n\t" "CMP %[len], #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_set_encrypt_key_start_128_%=\n\t" +#ifdef __GNUC__ + "BEQ 
L_AES_set_encrypt_key_start_128\n\t" #else - "BEQ.N L_AES_set_encrypt_key_start_128_%=\n\t" + "BEQ.W L_AES_set_encrypt_key_start_128\n\t" #endif "CMP %[len], #0xc0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_set_encrypt_key_start_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_set_encrypt_key_start_192\n\t" #else - "BEQ.N L_AES_set_encrypt_key_start_192_%=\n\t" + "BEQ.W L_AES_set_encrypt_key_start_192\n\t" #endif "LDRD r4, r5, [%[key]]\n\t" "LDRD r6, r7, [%[key], #8]\n\t" @@ -367,15 +373,15 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "SUB %[ks], %[ks], #0x10\n\t" "MOV r12, #0x6\n\t" "\n" - "L_AES_set_encrypt_key_loop_256_%=:\n\t" + "L_AES_set_encrypt_key_loop_256:\n\t" "UBFX r4, r7, #0, #8\n\t" "UBFX r5, r7, #8, #8\n\t" "UBFX r6, r7, #16, #8\n\t" "LSR r7, r7, #24\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r5, [r8, r5, LSL #2]\n\t" - "LDRB r6, [r8, r6, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" "EOR r3, r7, r4, LSL #8\n\t" "EOR r3, r3, r5, LSL #16\n\t" "EOR r3, r3, r6, LSL #24\n\t" @@ -394,10 +400,10 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "UBFX r5, r3, #16, #8\n\t" "LSR r6, r3, #24\n\t" "UBFX r3, r3, #0, #8\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r6, [r8, r6, LSL #2]\n\t" - "LDRB r5, [r8, r5, LSL #2]\n\t" - "LDRB r3, [r8, r3, LSL #2]\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r3, [r10, r3, LSL #2]\n\t" "EOR r3, r3, r4, LSL #8\n\t" "EOR r3, r3, r5, LSL #16\n\t" "EOR r3, r3, r6, LSL #24\n\t" @@ -411,18 +417,18 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "SUB %[ks], %[ks], #0x10\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_set_encrypt_key_loop_256_%=\n\t" + "BNE L_AES_set_encrypt_key_loop_256\n\t" #else - "BNE.N L_AES_set_encrypt_key_loop_256_%=\n\t" + "BNE.N L_AES_set_encrypt_key_loop_256\n\t" #endif "UBFX r4, r7, #0, #8\n\t" "UBFX r5, r7, #8, #8\n\t" "UBFX r6, r7, #16, #8\n\t" "LSR r7, r7, #24\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r5, [r8, r5, LSL #2]\n\t" - "LDRB r6, [r8, r6, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" "EOR r3, r7, r4, LSL #8\n\t" "EOR r3, r3, r5, LSL #16\n\t" "EOR r3, r3, r6, LSL #24\n\t" @@ -436,73 +442,81 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "ADD %[ks], %[ks], #0x10\n\t" "STM %[ks], {r4, r5, r6, r7}\n\t" "SUB %[ks], %[ks], #0x10\n\t" - "B L_AES_set_encrypt_key_end_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_set_encrypt_key_end\n\t" +#else + "B.N L_AES_set_encrypt_key_end\n\t" +#endif "\n" - "L_AES_set_encrypt_key_start_192_%=:\n\t" + "L_AES_set_encrypt_key_start_192:\n\t" "LDRD r4, r5, [%[key]]\n\t" "LDRD r6, r7, [%[key], #8]\n\t" - "LDRD %[key], %[len], [%[key], #16]\n\t" + "LDRD r8, r9, [%[key], #16]\n\t" "REV r4, r4\n\t" "REV r5, r5\n\t" "REV r6, r6\n\t" "REV r7, r7\n\t" - "REV %[key], %[key]\n\t" - "REV %[len], %[len]\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" "STM %[ks], {r4, r5, r6, r7}\n\t" - "STRD %[key], %[len], [%[ks], #16]\n\t" - "MOV r7, %[len]\n\t" + "STRD r8, r9, [%[ks], #16]\n\t" + "MOV r7, r9\n\t" "MOV r12, #0x7\n\t" "\n" - "L_AES_set_encrypt_key_loop_192_%=:\n\t" - "UBFX r0, r7, #0, #8\n\t" - "UBFX r1, r7, #8, #8\n\t" - "UBFX r4, r7, #16, #8\n\t" - "LSR r7, r7, #24\n\t" - "LDRB r0, [r8, r0, LSL #2]\n\t" - "LDRB r1, [r8, r1, LSL #2]\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" - "EOR r3, r7, r0, LSL #8\n\t" - "EOR r3, r3, r1, LSL 
#16\n\t" - "EOR r3, r3, r4, LSL #24\n\t" - "LDM %[ks]!, {r0, r1, r4, r5, r6, r7}\n\t" - "EOR r0, r0, r3\n\t" + "L_AES_set_encrypt_key_loop_192:\n\t" + "UBFX r4, r9, #0, #8\n\t" + "UBFX r5, r9, #8, #8\n\t" + "UBFX r6, r9, #16, #8\n\t" + "LSR r9, r9, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r9, [r10, r9, LSL #2]\n\t" + "EOR r3, r9, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7, r8, r9}\n\t" + "EOR r4, r4, r3\n\t" "LDM lr!, {r3}\n\t" - "EOR r0, r0, r3\n\t" - "EOR r1, r1, r0\n\t" - "EOR r4, r4, r1\n\t" + "EOR r4, r4, r3\n\t" "EOR r5, r5, r4\n\t" "EOR r6, r6, r5\n\t" "EOR r7, r7, r6\n\t" - "STM %[ks], {r0, r1, r4, r5, r6, r7}\n\t" + "EOR r8, r8, r7\n\t" + "EOR r9, r9, r8\n\t" + "STM %[ks], {r4, r5, r6, r7, r8, r9}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_set_encrypt_key_loop_192_%=\n\t" + "BNE L_AES_set_encrypt_key_loop_192\n\t" #else - "BNE.N L_AES_set_encrypt_key_loop_192_%=\n\t" + "BNE.N L_AES_set_encrypt_key_loop_192\n\t" #endif - "UBFX r0, r7, #0, #8\n\t" - "UBFX r1, r7, #8, #8\n\t" - "UBFX r4, r7, #16, #8\n\t" - "LSR r7, r7, #24\n\t" - "LDRB r0, [r8, r0, LSL #2]\n\t" - "LDRB r1, [r8, r1, LSL #2]\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" - "EOR r3, r7, r0, LSL #8\n\t" - "EOR r3, r3, r1, LSL #16\n\t" - "EOR r3, r3, r4, LSL #24\n\t" - "LDM %[ks]!, {r0, r1, r4, r5, r6, r7}\n\t" - "EOR r0, r0, r3\n\t" + "UBFX r4, r9, #0, #8\n\t" + "UBFX r5, r9, #8, #8\n\t" + "UBFX r6, r9, #16, #8\n\t" + "LSR r9, r9, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r9, [r10, r9, LSL #2]\n\t" + "EOR r3, r9, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7, r8, r9}\n\t" + "EOR r4, r4, r3\n\t" "LDM lr!, 
{r3}\n\t" - "EOR r0, r0, r3\n\t" - "EOR r1, r1, r0\n\t" - "EOR r4, r4, r1\n\t" + "EOR r4, r4, r3\n\t" "EOR r5, r5, r4\n\t" - "STM %[ks], {r0, r1, r4, r5}\n\t" - "B L_AES_set_encrypt_key_end_%=\n\t" + "EOR r6, r6, r5\n\t" + "EOR r7, r7, r6\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_set_encrypt_key_end\n\t" +#else + "B.N L_AES_set_encrypt_key_end\n\t" +#endif "\n" - "L_AES_set_encrypt_key_start_128_%=:\n\t" + "L_AES_set_encrypt_key_start_128:\n\t" "LDRD r4, r5, [%[key]]\n\t" "LDRD r6, r7, [%[key], #8]\n\t" "REV r4, r4\n\t" @@ -512,15 +526,15 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "STM %[ks], {r4, r5, r6, r7}\n\t" "MOV r12, #0xa\n\t" "\n" - "L_AES_set_encrypt_key_loop_128_%=:\n\t" + "L_AES_set_encrypt_key_loop_128:\n\t" "UBFX r4, r7, #0, #8\n\t" "UBFX r5, r7, #8, #8\n\t" "UBFX r6, r7, #16, #8\n\t" "LSR r7, r7, #24\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r5, [r8, r5, LSL #2]\n\t" - "LDRB r6, [r8, r6, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" "EOR r3, r7, r4, LSL #8\n\t" "EOR r3, r3, r5, LSL #16\n\t" "EOR r3, r3, r6, LSL #24\n\t" @@ -534,15 +548,21 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "STM %[ks], {r4, r5, r6, r7}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_set_encrypt_key_loop_128_%=\n\t" + "BNE L_AES_set_encrypt_key_loop_128\n\t" #else - "BNE.N L_AES_set_encrypt_key_loop_128_%=\n\t" + "BNE.N L_AES_set_encrypt_key_loop_128\n\t" #endif "\n" - "L_AES_set_encrypt_key_end_%=:\n\t" - : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c) + "L_AES_set_encrypt_key_end:\n\t" +#ifndef 
WOLFSSL_NO_VAR_ASSIGN_REG + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), + [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c) : - : "memory", "r12", "lr", "r5", "r6", "r7", "r8" +#else + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_rcon] "r" (L_AES_Thumb2_rcon) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10" ); } @@ -562,7 +582,7 @@ void AES_encrypt_block(const uint32_t* te, int nr, int len, const uint32_t* ks) __asm__ __volatile__ ( "\n" - "L_AES_encrypt_block_nr_%=:\n\t" + "L_AES_encrypt_block_nr:\n\t" "UBFX r8, r5, #16, #8\n\t" "LSR r11, r4, #24\n\t" "UBFX lr, r6, #8, #8\n\t" @@ -664,10 +684,10 @@ void AES_encrypt_block(const uint32_t* te, int nr, int len, const uint32_t* ks) "EOR r6, r6, r10\n\t" "EOR r7, r7, r11\n\t" "SUBS %[nr], %[nr], #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_encrypt_block_nr_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_encrypt_block_nr\n\t" #else - "BNE.N L_AES_encrypt_block_nr_%=\n\t" + "BNE.W L_AES_encrypt_block_nr\n\t" #endif "UBFX r8, r5, #16, #8\n\t" "LSR r11, r4, #24\n\t" @@ -793,28 +813,32 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long register unsigned long len __asm__ ("r2") = (unsigned long)len_p; register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_ecb_c __asm__ ("r5") = (uint32_t*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "PUSH {%[ks]}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_ECB_encrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_ECB_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_ECB_encrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_ECB_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_ECB_encrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_ECB_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_ECB_encrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_ECB_encrypt_start_block_192\n\t" #endif "\n" - "L_AES_ECB_encrypt_loop_block_256_%=:\n\t" + "L_AES_ECB_encrypt_loop_block_256:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -845,16 +869,20 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_encrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_ECB_encrypt_loop_block_256\n\t" +#endif #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_encrypt_loop_block_256_%=\n\t" + "B L_AES_ECB_encrypt_end\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_256_%=\n\t" + "B.N L_AES_ECB_encrypt_end\n\t" #endif - "B L_AES_ECB_encrypt_end_%=\n\t" "\n" - "L_AES_ECB_encrypt_start_block_192_%=:\n\t" + "L_AES_ECB_encrypt_start_block_192:\n\t" "\n" - "L_AES_ECB_encrypt_loop_block_192_%=:\n\t" + "L_AES_ECB_encrypt_loop_block_192:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -885,16 +913,20 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_encrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_ECB_encrypt_loop_block_192\n\t" +#endif #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE 
L_AES_ECB_encrypt_loop_block_192_%=\n\t" + "B L_AES_ECB_encrypt_end\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_192_%=\n\t" + "B.N L_AES_ECB_encrypt_end\n\t" #endif - "B L_AES_ECB_encrypt_end_%=\n\t" "\n" - "L_AES_ECB_encrypt_start_block_128_%=:\n\t" + "L_AES_ECB_encrypt_start_block_128:\n\t" "\n" - "L_AES_ECB_encrypt_loop_block_128_%=:\n\t" + "L_AES_ECB_encrypt_loop_block_128:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -925,16 +957,22 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_encrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_ECB_encrypt_loop_block_128\n\t" #endif "\n" - "L_AES_ECB_encrypt_end_%=:\n\t" + "L_AES_ECB_encrypt_end:\n\t" "POP {%[ks]}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -956,30 +994,38 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; register unsigned char* iv __asm__ ("r5") = (unsigned char*)iv_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_ecb_c __asm__ ("r6") = 
(uint32_t*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r9, r5\n\t" +#else + "LDR r9, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" "LDM r9, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r9}\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_encrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_CBC_encrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_CBC_encrypt_start_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_encrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_CBC_encrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_CBC_encrypt_start_block_192\n\t" #endif "\n" - "L_AES_CBC_encrypt_loop_block_256_%=:\n\t" + "L_AES_CBC_encrypt_loop_block_256:\n\t" "LDR r8, [lr]\n\t" "LDR r9, [lr, #4]\n\t" "LDR r10, [lr, #8]\n\t" @@ -1014,16 +1060,20 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_encrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_CBC_encrypt_loop_block_256\n\t" +#endif #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_encrypt_loop_block_256_%=\n\t" + "B L_AES_CBC_encrypt_end\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_256_%=\n\t" + "B.N L_AES_CBC_encrypt_end\n\t" #endif - "B L_AES_CBC_encrypt_end_%=\n\t" "\n" - "L_AES_CBC_encrypt_start_block_192_%=:\n\t" + "L_AES_CBC_encrypt_start_block_192:\n\t" "\n" - 
"L_AES_CBC_encrypt_loop_block_192_%=:\n\t" + "L_AES_CBC_encrypt_loop_block_192:\n\t" "LDR r8, [lr]\n\t" "LDR r9, [lr, #4]\n\t" "LDR r10, [lr, #8]\n\t" @@ -1058,16 +1108,20 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_encrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_CBC_encrypt_loop_block_192\n\t" +#endif #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_encrypt_loop_block_192_%=\n\t" + "B L_AES_CBC_encrypt_end\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_192_%=\n\t" + "B.N L_AES_CBC_encrypt_end\n\t" #endif - "B L_AES_CBC_encrypt_end_%=\n\t" "\n" - "L_AES_CBC_encrypt_start_block_128_%=:\n\t" + "L_AES_CBC_encrypt_start_block_128:\n\t" "\n" - "L_AES_CBC_encrypt_loop_block_128_%=:\n\t" + "L_AES_CBC_encrypt_loop_block_128:\n\t" "LDR r8, [lr]\n\t" "LDR r9, [lr, #4]\n\t" "LDR r10, [lr, #8]\n\t" @@ -1102,17 +1156,23 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_encrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_CBC_encrypt_loop_block_128\n\t" #endif "\n" - "L_AES_CBC_encrypt_end_%=:\n\t" + "L_AES_CBC_encrypt_end:\n\t" "POP {%[ks], r9}\n\t" "STM r9, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), 
[out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1134,12 +1194,20 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_ecb_c __asm__ ("r6") = (uint32_t*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r5\n\t" +#else + "LDR r8, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" "LDM r8, {r4, r5, r6, r7}\n\t" @@ -1150,19 +1218,19 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "STM r8, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r8}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CTR_encrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CTR_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_CTR_encrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_CTR_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CTR_encrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CTR_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_CTR_encrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_CTR_encrypt_start_block_192\n\t" #endif "\n" - "L_AES_CTR_encrypt_loop_block_256_%=:\n\t" + "L_AES_CTR_encrypt_loop_block_256:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, 
[sp, #16]\n\t" "ADDS r11, r7, #0x1\n\t" @@ -1201,16 +1269,20 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CTR_encrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CTR_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_256_%=\n\t" + "BNE.W L_AES_CTR_encrypt_loop_block_256\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_CTR_encrypt_end\n\t" +#else + "B.W L_AES_CTR_encrypt_end\n\t" #endif - "B L_AES_CTR_encrypt_end_%=\n\t" "\n" - "L_AES_CTR_encrypt_start_block_192_%=:\n\t" + "L_AES_CTR_encrypt_start_block_192:\n\t" "\n" - "L_AES_CTR_encrypt_loop_block_192_%=:\n\t" + "L_AES_CTR_encrypt_loop_block_192:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADDS r11, r7, #0x1\n\t" @@ -1249,16 +1321,20 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CTR_encrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CTR_encrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_CTR_encrypt_loop_block_192\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_CTR_encrypt_end\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_192_%=\n\t" + "B.W L_AES_CTR_encrypt_end\n\t" #endif - "B L_AES_CTR_encrypt_end_%=\n\t" "\n" - "L_AES_CTR_encrypt_start_block_128_%=:\n\t" + "L_AES_CTR_encrypt_start_block_128:\n\t" "\n" - "L_AES_CTR_encrypt_loop_block_128_%=:\n\t" + "L_AES_CTR_encrypt_loop_block_128:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADDS r11, r7, #0x1\n\t" @@ -1297,21 +1373,27 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], 
#0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CTR_encrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CTR_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_CTR_encrypt_loop_block_128\n\t" #endif "\n" - "L_AES_CTR_encrypt_end_%=:\n\t" + "L_AES_CTR_encrypt_end:\n\t" "POP {%[ks], r8}\n\t" "REV r4, r4\n\t" "REV r5, r5\n\t" "REV r6, r6\n\t" "REV r7, r7\n\t" "STM r8, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1334,7 +1416,7 @@ void AES_decrypt_block(const uint32_t* td, int nr, const uint8_t* td4) __asm__ __volatile__ ( "\n" - "L_AES_decrypt_block_nr_%=:\n\t" + "L_AES_decrypt_block_nr:\n\t" "UBFX r8, r7, #16, #8\n\t" "LSR r11, r4, #24\n\t" "UBFX r12, r6, #8, #8\n\t" @@ -1436,10 +1518,10 @@ void AES_decrypt_block(const uint32_t* td, int nr, const uint8_t* td4) "EOR r6, r6, r10\n\t" "EOR r7, r7, r11\n\t" "SUBS %[nr], %[nr], #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_decrypt_block_nr_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_decrypt_block_nr\n\t" #else - "BNE.N L_AES_decrypt_block_nr_%=\n\t" + "BNE.W L_AES_decrypt_block_nr\n\t" #endif "UBFX r8, r7, #16, #8\n\t" "LSR r11, r4, #24\n\t" @@ -1598,30 +1680,34 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long register unsigned long len __asm__ 
("r2") = (unsigned long)len_p; register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_td_ecb_c __asm__ ("r5") = (uint32_t*)L_AES_Thumb2_td_ecb; register unsigned char* L_AES_Thumb2_td4_c __asm__ ("r6") = (unsigned char*)&L_AES_Thumb2_td4; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_td_ecb]\n\t" "MOV r12, %[len]\n\t" "MOV r2, %[L_AES_Thumb2_td4]\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_ECB_decrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_ECB_decrypt_start_block_128\n\t" #else - "BEQ.N L_AES_ECB_decrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_ECB_decrypt_start_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_ECB_decrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_ECB_decrypt_start_block_192\n\t" #else - "BEQ.N L_AES_ECB_decrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_ECB_decrypt_start_block_192\n\t" #endif "\n" - "L_AES_ECB_decrypt_loop_block_256_%=:\n\t" + "L_AES_ECB_decrypt_loop_block_256:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -1651,16 +1737,20 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_decrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_ECB_decrypt_loop_block_256\n\t" +#endif #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_decrypt_loop_block_256_%=\n\t" + "B L_AES_ECB_decrypt_end\n\t" #else - "BNE.N 
L_AES_ECB_decrypt_loop_block_256_%=\n\t" + "B.N L_AES_ECB_decrypt_end\n\t" #endif - "B L_AES_ECB_decrypt_end_%=\n\t" "\n" - "L_AES_ECB_decrypt_start_block_192_%=:\n\t" + "L_AES_ECB_decrypt_start_block_192:\n\t" "\n" - "L_AES_ECB_decrypt_loop_block_192_%=:\n\t" + "L_AES_ECB_decrypt_loop_block_192:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -1690,16 +1780,20 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_decrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_ECB_decrypt_loop_block_192\n\t" +#endif #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_decrypt_loop_block_192_%=\n\t" + "B L_AES_ECB_decrypt_end\n\t" #else - "BNE.N L_AES_ECB_decrypt_loop_block_192_%=\n\t" + "B.N L_AES_ECB_decrypt_end\n\t" #endif - "B L_AES_ECB_decrypt_end_%=\n\t" "\n" - "L_AES_ECB_decrypt_start_block_128_%=:\n\t" + "L_AES_ECB_decrypt_start_block_128:\n\t" "\n" - "L_AES_ECB_decrypt_loop_block_128_%=:\n\t" + "L_AES_ECB_decrypt_loop_block_128:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -1729,15 +1823,21 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_decrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_decrypt_loop_block_128\n\t" #else - "BNE.N L_AES_ECB_decrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_ECB_decrypt_loop_block_128\n\t" #endif "\n" - "L_AES_ECB_decrypt_end_%=:\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) + "L_AES_ECB_decrypt_end:\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : 
[in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), + [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr) + : [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1759,32 +1859,40 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; register unsigned char* iv __asm__ ("r5") = (unsigned char*)iv_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_td_ecb_c __asm__ ("r6") = (uint32_t*)L_AES_Thumb2_td_ecb; register unsigned char* L_AES_Thumb2_td4_c __asm__ ("r7") = (unsigned char*)&L_AES_Thumb2_td4; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r4, r5\n\t" +#else + "LDR r4, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_td_ecb]\n\t" "MOV r12, %[len]\n\t" "MOV r2, %[L_AES_Thumb2_td4]\n\t" "PUSH {%[ks], r4}\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_loop_block_128\n\t" #else - "BEQ.N L_AES_CBC_decrypt_loop_block_128_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_loop_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_loop_block_192\n\t" #else - "BEQ.N 
L_AES_CBC_decrypt_loop_block_192_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_loop_block_192\n\t" #endif "\n" - "L_AES_CBC_decrypt_loop_block_256_%=:\n\t" + "L_AES_CBC_decrypt_loop_block_256:\n\t" "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -1824,10 +1932,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -1869,14 +1977,18 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_decrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_decrypt_loop_block_256\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_256_%=\n\t" + "BNE.W L_AES_CBC_decrypt_loop_block_256\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_CBC_decrypt_end\n\t" +#else + "B.W L_AES_CBC_decrypt_end\n\t" #endif - "B L_AES_CBC_decrypt_end_%=\n\t" "\n" - "L_AES_CBC_decrypt_loop_block_192_%=:\n\t" + "L_AES_CBC_decrypt_loop_block_192:\n\t" "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -1916,10 +2028,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, 
r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -1961,14 +2073,18 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_decrypt_loop_block_192\n\t" +#else + "BNE.W L_AES_CBC_decrypt_loop_block_192\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_CBC_decrypt_end\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_192_%=\n\t" + "B.W L_AES_CBC_decrypt_end\n\t" #endif - "B L_AES_CBC_decrypt_end_%=\n\t" "\n" - "L_AES_CBC_decrypt_loop_block_128_%=:\n\t" + "L_AES_CBC_decrypt_loop_block_128:\n\t" "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -2008,10 +2124,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -2053,24 +2169,34 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_decrypt_loop_block_128\n\t" +#else + "BNE.W L_AES_CBC_decrypt_loop_block_128\n\t" +#endif #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_decrypt_loop_block_128_%=\n\t" + "B L_AES_CBC_decrypt_end\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_128_%=\n\t" + "B.N L_AES_CBC_decrypt_end\n\t" #endif - "B L_AES_CBC_decrypt_end_%=\n\t" "\n" - "L_AES_CBC_decrypt_end_odd_%=:\n\t" + "L_AES_CBC_decrypt_end_odd:\n\t" "LDR r4, 
[sp, #4]\n\t" "LDRD r8, r9, [r4, #16]\n\t" "LDRD r10, r11, [r4, #24]\n\t" "STRD r8, r9, [r4]\n\t" "STRD r10, r11, [r4, #8]\n\t" "\n" - "L_AES_CBC_decrypt_end_%=:\n\t" + "L_AES_CBC_decrypt_end:\n\t" "POP {%[ks], r4}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv) + : [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r8", "r9", "r10", "r11" ); } @@ -2099,13 +2225,13 @@ void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned cha register const unsigned char** m __asm__ ("r1") = (const unsigned char**)m_p; register const unsigned char* data __asm__ ("r2") = (const unsigned char*)data_p; register unsigned long len __asm__ ("r3") = (unsigned long)len_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_GCM_gmult_len_r_c __asm__ ("r4") = (uint32_t*)&L_GCM_gmult_len_r; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "MOV lr, %[L_GCM_gmult_len_r]\n\t" "\n" - "L_GCM_gmult_len_start_block_%=:\n\t" + "L_GCM_gmult_len_start_block:\n\t" "PUSH {r3}\n\t" "LDR r12, [r0, #12]\n\t" "LDR %[len], [r2, #12]\n\t" @@ -2650,13 +2776,19 @@ void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned cha "POP {r3}\n\t" "SUBS %[len], %[len], #0x10\n\t" "ADD %[data], %[data], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_GCM_gmult_len_start_block_%=\n\t" +#ifdef __GNUC__ + 
"BNE L_GCM_gmult_len_start_block\n\t" #else - "BNE.N L_GCM_gmult_len_start_block_%=\n\t" + "BNE.W L_GCM_gmult_len_start_block\n\t" #endif - : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), + [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) : +#else + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len) + : [L_GCM_gmult_len_r] "r" (L_GCM_gmult_len_r) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -2677,12 +2809,20 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_gcm_c __asm__ ("r6") = (uint32_t*)L_AES_Thumb2_te_gcm; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r5\n\t" +#else + "LDR r8, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_gcm]\n\t" "LDM r8, {r4, r5, r6, r7}\n\t" @@ -2693,19 +2833,19 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "STM r8, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r8}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_GCM_encrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_GCM_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_GCM_encrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_GCM_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if 
defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_GCM_encrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_GCM_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_GCM_encrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_GCM_encrypt_start_block_192\n\t" #endif "\n" - "L_AES_GCM_encrypt_loop_block_256_%=:\n\t" + "L_AES_GCM_encrypt_loop_block_256:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADD r7, r7, #0x1\n\t" @@ -2741,16 +2881,20 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_GCM_encrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_GCM_encrypt_loop_block_256\n\t" +#else + "BNE.W L_AES_GCM_encrypt_loop_block_256\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_GCM_encrypt_end\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_256_%=\n\t" + "B.W L_AES_GCM_encrypt_end\n\t" #endif - "B L_AES_GCM_encrypt_end_%=\n\t" "\n" - "L_AES_GCM_encrypt_start_block_192_%=:\n\t" + "L_AES_GCM_encrypt_start_block_192:\n\t" "\n" - "L_AES_GCM_encrypt_loop_block_192_%=:\n\t" + "L_AES_GCM_encrypt_loop_block_192:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADD r7, r7, #0x1\n\t" @@ -2786,16 +2930,20 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_GCM_encrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_GCM_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_192_%=\n\t" + "BNE.W L_AES_GCM_encrypt_loop_block_192\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_GCM_encrypt_end\n\t" +#else + "B.W L_AES_GCM_encrypt_end\n\t" #endif - "B L_AES_GCM_encrypt_end_%=\n\t" "\n" - 
"L_AES_GCM_encrypt_start_block_128_%=:\n\t" + "L_AES_GCM_encrypt_start_block_128:\n\t" "\n" - "L_AES_GCM_encrypt_loop_block_128_%=:\n\t" + "L_AES_GCM_encrypt_loop_block_128:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADD r7, r7, #0x1\n\t" @@ -2831,21 +2979,27 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_GCM_encrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_GCM_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_GCM_encrypt_loop_block_128\n\t" #endif "\n" - "L_AES_GCM_encrypt_end_%=:\n\t" + "L_AES_GCM_encrypt_end:\n\t" "POP {%[ks], r8}\n\t" "REV r4, r4\n\t" "REV r5, r5\n\t" "REV r6, r6\n\t" "REV r7, r7\n\t" "STM r8, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [L_AES_Thumb2_te_gcm] "r" (L_AES_Thumb2_te_gcm) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -2854,7 +3008,7 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long #endif /* !NO_AES */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ -#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* !defined(__aarch64__) && defined(__thumb__) */ #endif /* WOLFSSL_ARMASM */ #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519.S 
b/wolfcrypt/src/port/arm/thumb2-curve25519.S index c5ca56b18a..e6b5dcf5d2 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519.S +++ b/wolfcrypt/src/port/arm/thumb2-curve25519.S @@ -43,7 +43,7 @@ .type fe_init, %function fe_init: BX lr - # Cycle Count = 4 + /* Cycle Count = 4 */ .size fe_init,.-fe_init .text .align 4 @@ -51,62 +51,62 @@ fe_init: .type fe_add_sub_op, %function fe_add_sub_op: PUSH {lr} - # Add-Sub + /* Add-Sub */ LDRD r4, r5, [r2] LDRD r6, r7, [r3] - # Add + /* Add */ ADDS r8, r4, r6 MOV r12, #0x0 ADCS r9, r5, r7 ADC r12, r12, #0x0 STRD r8, r9, [r0] - # Sub + /* Sub */ SUBS r10, r4, r6 SBCS r11, r5, r7 STRD r10, r11, [r1] LDRD r4, r5, [r2, #8] LDRD r6, r7, [r3, #8] - # Sub + /* Sub */ SBCS r10, r4, r6 MOV lr, #0x0 SBCS r11, r5, r7 ADC lr, lr, #0x0 STRD r10, r11, [r1, #8] - # Add + /* Add */ SUBS r12, r12, #0x1 ADCS r8, r4, r6 ADCS r9, r5, r7 STRD r8, r9, [r0, #8] LDRD r4, r5, [r2, #16] LDRD r6, r7, [r3, #16] - # Add + /* Add */ ADCS r8, r4, r6 MOV r12, #0x0 ADCS r9, r5, r7 ADC r12, r12, #0x0 STRD r8, r9, [r0, #16] - # Sub + /* Sub */ SUBS lr, lr, #0x1 SBCS r10, r4, r6 SBCS r11, r5, r7 STRD r10, r11, [r1, #16] LDRD r4, r5, [r2, #24] LDRD r6, r7, [r3, #24] - # Sub + /* Sub */ SBCS r10, r4, r6 SBC r11, r5, r7 - # Add + /* Add */ SUBS r12, r12, #0x1 ADCS r8, r4, r6 MOV r12, #0x0 ADCS r9, r5, r7 ADC r12, r12, #0x0 - # Multiply -modulus by overflow + /* Multiply -modulus by overflow */ LSL r3, r12, #1 MOV r12, #0x13 ORR r3, r3, r9, LSR #31 MUL r12, r3, r12 - # Add -x*modulus (if overflow) + /* Add -x*modulus (if overflow) */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] ADDS r4, r4, r12 @@ -123,7 +123,7 @@ fe_add_sub_op: ADCS r8, r8, #0x0 ADC r9, r9, #0x0 STRD r8, r9, [r0, #24] - # Add -modulus on underflow + /* Add -modulus on underflow */ MOV lr, #0x13 AND lr, lr, r11, ASR #31 LDM r1, {r4, r5, r6, r7, r8, r9} @@ -137,9 +137,9 @@ fe_add_sub_op: SBCS r10, r10, #0x0 SBC r11, r11, #0x0 STM r1, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Add-Sub + /* Done Add-Sub 
*/ POP {pc} - # Cycle Count = 134 + /* Cycle Count = 134 */ .size fe_add_sub_op,.-fe_add_sub_op .text .align 4 @@ -147,7 +147,7 @@ fe_add_sub_op: .type fe_sub_op, %function fe_sub_op: PUSH {lr} - # Sub + /* Sub */ LDM r2!, {r6, r7, r8, r9, r10, r11, r12, lr} LDM r1!, {r2, r3, r4, r5} SUBS r6, r2, r6 @@ -171,9 +171,9 @@ fe_sub_op: SBCS r12, r12, #0x0 SBC lr, lr, #0x0 STM r0, {r6, r7, r8, r9, r10, r11, r12, lr} - # Done Sub + /* Done Sub */ POP {pc} - # Cycle Count = 51 + /* Cycle Count = 51 */ .size fe_sub_op,.-fe_sub_op .text .align 4 @@ -183,7 +183,7 @@ fe_sub: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} BL fe_sub_op POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 24 + /* Cycle Count = 24 */ .size fe_sub,.-fe_sub .text .align 4 @@ -191,7 +191,7 @@ fe_sub: .type fe_add_op, %function fe_add_op: PUSH {lr} - # Add + /* Add */ LDM r2!, {r6, r7, r8, r9, r10, r11, r12, lr} LDM r1!, {r2, r3, r4, r5} ADDS r6, r2, r6 @@ -215,9 +215,9 @@ fe_add_op: ADCS r12, r12, #0x0 ADC lr, lr, #0x0 STM r0, {r6, r7, r8, r9, r10, r11, r12, lr} - # Done Add + /* Done Add */ POP {pc} - # Cycle Count = 51 + /* Cycle Count = 51 */ .size fe_add_op,.-fe_add_op .text .align 4 @@ -227,7 +227,7 @@ fe_add: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} BL fe_add_op POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 24 + /* Cycle Count = 24 */ .size fe_add,.-fe_add #ifdef HAVE_ED25519 .text @@ -254,7 +254,7 @@ fe_frombytes: STR r8, [r0, #24] STR r9, [r0, #28] POP {r4, r5, r6, r7, r8, r9, pc} - # Cycle Count = 49 + /* Cycle Count = 49 */ .size fe_frombytes,.-fe_frombytes .text .align 4 @@ -291,7 +291,7 @@ fe_tobytes: STR r8, [r0, #24] STR r9, [r0, #28] POP {r4, r5, r6, r7, r8, r9, r10, pc} - # Cycle Count = 62 + /* Cycle Count = 62 */ .size fe_tobytes,.-fe_tobytes .text .align 4 @@ -299,7 +299,7 @@ fe_tobytes: .type fe_1, %function fe_1: PUSH {r4, r5, r6, r7, r8, r9, lr} - # Set one + /* Set one */ MOV r2, #0x1 MOV r3, #0x0 MOV r4, #0x0 @@ -310,7 +310,7 @@ fe_1: MOV r9, #0x0 STM r0, 
{r2, r3, r4, r5, r6, r7, r8, r9} POP {r4, r5, r6, r7, r8, r9, pc} - # Cycle Count = 33 + /* Cycle Count = 33 */ .size fe_1,.-fe_1 .text .align 4 @@ -318,7 +318,7 @@ fe_1: .type fe_0, %function fe_0: PUSH {r4, r5, r6, r7, r8, r9, lr} - # Set zero + /* Set zero */ MOV r2, #0x0 MOV r3, #0x0 MOV r4, #0x0 @@ -329,7 +329,7 @@ fe_0: MOV r9, #0x0 STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} POP {r4, r5, r6, r7, r8, r9, pc} - # Cycle Count = 33 + /* Cycle Count = 33 */ .size fe_0,.-fe_0 .text .align 4 @@ -337,7 +337,7 @@ fe_0: .type fe_copy, %function fe_copy: PUSH {r4, r5, lr} - # Copy + /* Copy */ LDRD r2, r3, [r1] LDRD r4, r5, [r1, #8] STRD r2, r3, [r0] @@ -347,7 +347,7 @@ fe_copy: STRD r2, r3, [r0, #16] STRD r4, r5, [r0, #24] POP {r4, r5, pc} - # Cycle Count = 32 + /* Cycle Count = 32 */ .size fe_copy,.-fe_copy .text .align 4 @@ -371,7 +371,7 @@ fe_neg: SBC r5, r6, r5 STM r0!, {r2, r3, r4, r5} POP {r4, r5, r6, r7, pc} - # Cycle Count = 43 + /* Cycle Count = 43 */ .size fe_neg,.-fe_neg .text .align 4 @@ -407,7 +407,7 @@ fe_isnonzero: ORR r2, r2, r8 ORR r0, r2, r4 POP {r4, r5, r6, r7, r8, r9, r10, pc} - # Cycle Count = 53 + /* Cycle Count = 53 */ .size fe_isnonzero,.-fe_isnonzero .text .align 4 @@ -430,7 +430,7 @@ fe_isnegative: LSR r1, r1, #31 EOR r0, r0, r1 POP {r4, r5, pc} - # Cycle Count = 31 + /* Cycle Count = 31 */ .size fe_isnegative,.-fe_isnegative #if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) #ifndef WC_NO_CACHE_RESISTANT @@ -1404,7 +1404,7 @@ fe_cmov_table: STRD r6, r7, [r0, #56] STRD r8, r9, [r0, #88] POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 1195 + /* Cycle Count = 1195 */ .size fe_cmov_table,.-fe_cmov_table #else .text @@ -1506,7 +1506,7 @@ fe_cmov_table: STM r0!, {r4, r5, r6, r7} SUB r1, r1, r2 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 160 + /* Cycle Count = 160 */ .size fe_cmov_table,.-fe_cmov_table #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */ @@ -1522,329 
+1522,329 @@ fe_mul_op: STR r0, [sp, #36] MOV r0, #0x0 LDR r12, [r1] - # A[0] * B[0] + /* A[0] * B[0] */ LDR lr, [r2] UMULL r3, r4, r12, lr - # A[0] * B[2] + /* A[0] * B[2] */ LDR lr, [r2, #8] UMULL r5, r6, r12, lr - # A[0] * B[4] + /* A[0] * B[4] */ LDR lr, [r2, #16] UMULL r7, r8, r12, lr - # A[0] * B[6] + /* A[0] * B[6] */ LDR lr, [r2, #24] UMULL r9, r10, r12, lr STR r3, [sp] - # A[0] * B[1] + /* A[0] * B[1] */ LDR lr, [r2, #4] MOV r11, r0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[0] * B[3] + /* A[0] * B[3] */ LDR lr, [r2, #12] ADCS r6, r6, #0x0 ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[0] * B[5] + /* A[0] * B[5] */ LDR lr, [r2, #20] ADCS r8, r8, #0x0 ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[0] * B[7] + /* A[0] * B[7] */ LDR lr, [r2, #28] ADCS r10, r10, #0x0 ADC r3, r0, #0x0 UMLAL r10, r3, r12, lr - # A[1] * B[0] + /* A[1] * B[0] */ LDR r12, [r1, #4] LDR lr, [r2] MOV r11, #0x0 UMLAL r4, r11, r12, lr STR r4, [sp, #4] ADDS r5, r5, r11 - # A[1] * B[1] + /* A[1] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[1] * B[2] + /* A[1] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[1] * B[3] + /* A[1] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[1] * B[4] + /* A[1] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[1] * B[5] + /* A[1] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[1] * B[6] + /* A[1] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[1] * B[7] + /* A[1] * B[7] */ LDR lr, [r2, #28] ADC r4, r0, #0x0 UMLAL r3, r4, r12, lr - # A[2] * B[0] + /* A[2] * B[0] */ LDR r12, [r1, #8] LDR lr, [r2] MOV r11, #0x0 UMLAL r5, r11, r12, lr STR r5, [sp, #8] ADDS r6, r6, r11 - # A[2] * B[1] + /* A[2] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr 
ADDS r7, r7, r11 - # A[2] * B[2] + /* A[2] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[2] * B[3] + /* A[2] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[2] * B[4] + /* A[2] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[2] * B[5] + /* A[2] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[2] * B[6] + /* A[2] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[2] * B[7] + /* A[2] * B[7] */ LDR lr, [r2, #28] ADC r5, r0, #0x0 UMLAL r4, r5, r12, lr - # A[3] * B[0] + /* A[3] * B[0] */ LDR r12, [r1, #12] LDR lr, [r2] MOV r11, #0x0 UMLAL r6, r11, r12, lr STR r6, [sp, #12] ADDS r7, r7, r11 - # A[3] * B[1] + /* A[3] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[3] * B[2] + /* A[3] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[3] * B[3] + /* A[3] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[3] * B[4] + /* A[3] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[3] * B[5] + /* A[3] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[3] * B[6] + /* A[3] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[3] * B[7] + /* A[3] * B[7] */ LDR lr, [r2, #28] ADC r6, r0, #0x0 UMLAL r5, r6, r12, lr - # A[4] * B[0] + /* A[4] * B[0] */ LDR r12, [r1, #16] LDR lr, [r2] MOV r11, #0x0 UMLAL r7, r11, r12, lr STR r7, [sp, #16] ADDS r8, r8, r11 - # A[4] * B[1] + /* A[4] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[4] * B[2] + /* A[4] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[4] * B[3] + /* A[4] * B[3] */ LDR lr, [r2, #12] ADC 
r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[4] * B[4] + /* A[4] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[4] * B[5] + /* A[4] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[4] * B[6] + /* A[4] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[4] * B[7] + /* A[4] * B[7] */ LDR lr, [r2, #28] ADC r7, r0, #0x0 UMLAL r6, r7, r12, lr - # A[5] * B[0] + /* A[5] * B[0] */ LDR r12, [r1, #20] LDR lr, [r2] MOV r11, #0x0 UMLAL r8, r11, r12, lr STR r8, [sp, #20] ADDS r9, r9, r11 - # A[5] * B[1] + /* A[5] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[5] * B[2] + /* A[5] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[5] * B[3] + /* A[5] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[5] * B[4] + /* A[5] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[5] * B[5] + /* A[5] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[5] * B[6] + /* A[5] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[5] * B[7] + /* A[5] * B[7] */ LDR lr, [r2, #28] ADC r8, r0, #0x0 UMLAL r7, r8, r12, lr - # A[6] * B[0] + /* A[6] * B[0] */ LDR r12, [r1, #24] LDR lr, [r2] MOV r11, #0x0 UMLAL r9, r11, r12, lr STR r9, [sp, #24] ADDS r10, r10, r11 - # A[6] * B[1] + /* A[6] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[6] * B[2] + /* A[6] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[6] * B[3] + /* A[6] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[6] * B[4] + /* A[6] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[6] * B[5] + /* 
A[6] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[6] * B[6] + /* A[6] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[6] * B[7] + /* A[6] * B[7] */ LDR lr, [r2, #28] ADC r9, r0, #0x0 UMLAL r8, r9, r12, lr - # A[7] * B[0] + /* A[7] * B[0] */ LDR r12, [r1, #28] LDR lr, [r2] MOV r11, #0x0 UMLAL r10, r11, r12, lr STR r10, [sp, #28] ADDS r3, r3, r11 - # A[7] * B[1] + /* A[7] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[7] * B[2] + /* A[7] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[7] * B[3] + /* A[7] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[7] * B[4] + /* A[7] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[7] * B[5] + /* A[7] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[7] * B[6] + /* A[7] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[7] * B[7] + /* A[7] * B[7] */ LDR lr, [r2, #28] ADC r10, r0, #0x0 UMLAL r9, r10, r12, lr - # Reduce + /* Reduce */ LDR r2, [sp, #28] MOV lr, sp MOV r12, #0x26 @@ -1883,12 +1883,12 @@ fe_mul_op: UMLAL r7, r11, r9, r12 BFC r10, #31, #1 ADDS r8, r10, r11 - # Store + /* Store */ LDR r0, [sp, #36] STM r0, {r1, r2, r3, r4, r5, r6, r7, r8} ADD sp, sp, #0x28 POP {pc} - # Cycle Count = 406 + /* Cycle Count = 406 */ .size fe_mul_op,.-fe_mul_op #else .text @@ -1995,7 +1995,7 @@ fe_mul_op: UMAAL r8, r10, r2, lr UMAAL r8, r9, r3, r11 UMAAL r9, r10, r3, lr - # Reduce + /* Reduce */ LDR r0, [sp, #28] MOV lr, #0x25 UMAAL r10, r0, r10, lr @@ -2017,11 +2017,11 @@ fe_mul_op: UMAAL r6, r11, r9, lr ADD r7, r10, r11 LDR lr, [sp, #8] - # Store + /* Store */ STM lr, {r0, r1, r2, r3, r4, r5, r6, r7} ADD sp, sp, #0x10 POP {pc} - # Cycle Count = 239 + /* Cycle Count = 239 */ .size 
fe_mul_op,.-fe_mul_op #endif /* WOLFSSL_SP_NO_UMAAL */ .text @@ -2032,7 +2032,7 @@ fe_mul: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} BL fe_mul_op POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 24 + /* Cycle Count = 24 */ .size fe_mul,.-fe_mul #ifdef WOLFSSL_SP_NO_UMAAL .text @@ -2043,33 +2043,33 @@ fe_sq_op: PUSH {lr} SUB sp, sp, #0x44 STR r0, [sp, #64] - # Square + /* Square */ MOV r0, #0x0 LDR r12, [r1] - # A[0] * A[1] + /* A[0] * A[1] */ LDR lr, [r1, #4] UMULL r4, r5, r12, lr - # A[0] * A[3] + /* A[0] * A[3] */ LDR lr, [r1, #12] UMULL r6, r7, r12, lr - # A[0] * A[5] + /* A[0] * A[5] */ LDR lr, [r1, #20] UMULL r8, r9, r12, lr - # A[0] * A[7] + /* A[0] * A[7] */ LDR lr, [r1, #28] UMULL r10, r3, r12, lr - # A[0] * A[2] + /* A[0] * A[2] */ LDR lr, [r1, #8] MOV r11, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[0] * A[4] + /* A[0] * A[4] */ LDR lr, [r1, #16] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[0] * A[6] + /* A[0] * A[6] */ LDR lr, [r1, #24] ADCS r9, r9, #0x0 ADC r11, r0, #0x0 @@ -2078,112 +2078,112 @@ fe_sq_op: ADCS r3, r3, #0x0 STR r4, [sp, #4] STR r5, [sp, #8] - # A[1] * A[2] + /* A[1] * A[2] */ LDR r12, [r1, #4] LDR lr, [r1, #8] MOV r11, #0x0 UMLAL r6, r11, r12, lr STR r6, [sp, #12] ADDS r7, r7, r11 - # A[1] * A[3] + /* A[1] * A[3] */ LDR lr, [r1, #12] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr STR r7, [sp, #16] ADDS r8, r8, r11 - # A[1] * A[4] + /* A[1] * A[4] */ LDR lr, [r1, #16] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[1] * A[5] + /* A[1] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[1] * A[6] + /* A[1] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[1] * A[7] + /* A[1] * A[7] */ LDR lr, [r1, #28] ADC r4, r0, #0x0 UMLAL r3, r4, r12, lr - # A[2] * A[3] + /* A[2] * A[3] */ LDR r12, [r1, #8] LDR lr, [r1, #12] MOV r11, #0x0 UMLAL r8, r11, r12, lr STR r8, [sp, #20] ADDS r9, r9, r11 - # 
A[2] * A[4] + /* A[2] * A[4] */ LDR lr, [r1, #16] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr STR r9, [sp, #24] ADDS r10, r10, r11 - # A[2] * A[5] + /* A[2] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[2] * A[6] + /* A[2] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[2] * A[7] + /* A[2] * A[7] */ LDR lr, [r1, #28] ADC r5, r0, #0x0 UMLAL r4, r5, r12, lr - # A[3] * A[4] + /* A[3] * A[4] */ LDR r12, [r1, #12] LDR lr, [r1, #16] MOV r11, #0x0 UMLAL r10, r11, r12, lr STR r10, [sp, #28] ADDS r3, r3, r11 - # A[3] * A[5] + /* A[3] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[3] * A[6] + /* A[3] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[3] * A[7] + /* A[3] * A[7] */ LDR lr, [r1, #28] ADC r6, r0, #0x0 UMLAL r5, r6, r12, lr - # A[4] * A[5] + /* A[4] * A[5] */ LDR r12, [r1, #16] LDR lr, [r1, #20] MOV r11, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[4] * A[6] + /* A[4] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[4] * A[7] + /* A[4] * A[7] */ LDR lr, [r1, #28] ADC r7, r0, #0x0 UMLAL r6, r7, r12, lr - # A[5] * A[6] + /* A[5] * A[6] */ LDR r12, [r1, #20] LDR lr, [r1, #24] MOV r11, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[5] * A[7] + /* A[5] * A[7] */ LDR lr, [r1, #28] ADC r8, r0, #0x0 UMLAL r7, r8, r12, lr - # A[6] * A[7] + /* A[6] * A[7] */ LDR r12, [r1, #24] LDR lr, [r1, #28] MOV r9, #0x0 @@ -2213,23 +2213,23 @@ fe_sq_op: ADD lr, sp, #0x4 LDM lr, {r4, r5, r6, r7, r8, r9, r10} MOV lr, sp - # A[0] * A[0] + /* A[0] * A[0] */ LDR r12, [r1] UMULL r3, r11, r12, r12 ADDS r4, r4, r11 - # A[1] * A[1] + /* A[1] * A[1] */ LDR r12, [r1, #4] ADCS r5, r5, #0x0 ADC r11, r0, #0x0 UMLAL r5, r11, r12, r12 ADDS r6, r6, r11 - # A[2] * A[2] + /* A[2] * A[2] */ LDR r12, [r1, #8] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, r12 ADDS r8, r8, 
r11 - # A[3] * A[3] + /* A[3] * A[3] */ LDR r12, [r1, #12] ADCS r9, r9, #0x0 ADC r11, r0, #0x0 @@ -2237,30 +2237,30 @@ fe_sq_op: ADDS r10, r10, r11 STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10} LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10} - # A[4] * A[4] + /* A[4] * A[4] */ LDR r12, [r1, #16] ADCS r3, r3, #0x0 ADC r11, r0, #0x0 UMLAL r3, r11, r12, r12 ADDS r4, r4, r11 - # A[5] * A[5] + /* A[5] * A[5] */ LDR r12, [r1, #20] ADCS r5, r5, #0x0 ADC r11, r0, #0x0 UMLAL r5, r11, r12, r12 ADDS r6, r6, r11 - # A[6] * A[6] + /* A[6] * A[6] */ LDR r12, [r1, #24] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, r12 ADDS r8, r8, r11 - # A[7] * A[7] + /* A[7] * A[7] */ LDR r12, [r1, #28] ADCS r9, r9, #0x0 ADC r10, r10, #0x0 UMLAL r9, r10, r12, r12 - # Reduce + /* Reduce */ LDR r2, [sp, #28] MOV lr, sp MOV r12, #0x26 @@ -2299,12 +2299,12 @@ fe_sq_op: UMLAL r7, r11, r9, r12 BFC r10, #31, #1 ADDS r8, r10, r11 - # Store + /* Store */ LDR r0, [sp, #64] STM r0, {r1, r2, r3, r4, r5, r6, r7, r8} ADD sp, sp, #0x44 POP {pc} - # Cycle Count = 355 + /* Cycle Count = 355 */ .size fe_sq_op,.-fe_sq_op #else .text @@ -2316,7 +2316,7 @@ fe_sq_op: SUB sp, sp, #0x20 STR r0, [sp, #28] LDM r1, {r0, r1, r2, r3, r4, r5, r6, r7} - # Square + /* Square */ UMULL r9, r10, r0, r0 UMULL r11, r12, r0, r1 ADDS r11, r11, r11 @@ -2357,46 +2357,46 @@ fe_sq_op: UMAAL r0, r10, r3, r4 ADCS r0, r0, r0 UMAAL r0, r11, lr, lr - # R[7] = r0 + /* R[7] = r0 */ UMAAL r9, r8, r1, r7 UMAAL r9, r10, r2, r6 UMAAL r12, r9, r3, r5 ADCS r12, r12, r12 UMAAL r12, r11, r4, r4 - # R[8] = r12 + /* R[8] = r12 */ UMAAL r9, r8, r2, r7 UMAAL r10, r9, r3, r6 MOV r2, lr UMAAL r10, r2, r4, r5 ADCS r10, r10, r10 UMAAL r11, r10, lr, lr - # R[9] = r11 + /* R[9] = r11 */ UMAAL r2, r8, r3, r7 UMAAL r2, r9, r4, r6 ADCS r3, r2, r2 UMAAL r10, r3, r5, r5 - # R[10] = r10 + /* R[10] = r10 */ MOV r1, lr UMAAL r1, r8, r4, r7 UMAAL r1, r9, r5, r6 ADCS r4, r1, r1 UMAAL r3, r4, lr, lr - # R[11] = r3 + /* R[11] = r3 */ UMAAL r8, r9, r5, r7 ADCS r8, r8, r8 
UMAAL r4, r8, r6, r6 - # R[12] = r4 + /* R[12] = r4 */ MOV r5, lr UMAAL r5, r9, r6, r7 ADCS r5, r5, r5 UMAAL r8, r5, lr, lr - # R[13] = r8 + /* R[13] = r8 */ ADCS r9, r9, r9 UMAAL r9, r5, r7, r7 ADCS r7, r5, lr - # R[14] = r9 - # R[15] = r7 - # Reduce + /* R[14] = r9 */ + /* R[15] = r7 */ + /* Reduce */ MOV r6, #0x25 UMAAL r7, r0, r7, r6 MOV r6, #0x13 @@ -2420,10 +2420,10 @@ fe_sq_op: UMAAL r6, lr, r9, r12 ADD r7, r7, lr POP {lr} - # Store + /* Store */ STM lr, {r0, r1, r2, r3, r4, r5, r6, r7} POP {pc} - # Cycle Count = 179 + /* Cycle Count = 179 */ .size fe_sq_op,.-fe_sq_op #endif /* WOLFSSL_SP_NO_UMAAL */ .text @@ -2434,7 +2434,7 @@ fe_sq: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} BL fe_sq_op POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 24 + /* Cycle Count = 24 */ .size fe_sq,.-fe_sq #ifdef HAVE_CURVE25519 #ifdef WOLFSSL_SP_NO_UMAAL @@ -2444,7 +2444,7 @@ fe_sq: .type fe_mul121666, %function fe_mul121666: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} - # Multiply by 121666 + /* Multiply by 121666 */ LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9} MOV r12, #0xdb42 MOVT r12, #0x1 @@ -2485,7 +2485,7 @@ fe_mul121666: ADC r9, r9, #0x0 STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 75 + /* Cycle Count = 75 */ .size fe_mul121666,.-fe_mul121666 #else .text @@ -2494,7 +2494,7 @@ fe_mul121666: .type fe_mul121666, %function fe_mul121666: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} - # Multiply by 121666 + /* Multiply by 121666 */ LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9} MOV r11, #0xdb42 MOVT r11, #0x1 @@ -2522,7 +2522,7 @@ fe_mul121666: ADC r9, r9, #0x0 STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 69 + /* Cycle Count = 69 */ .size fe_mul121666,.-fe_mul121666 #endif /* WOLFSSL_SP_NO_UMAAL */ #ifndef WC_NO_CACHE_RESISTANT @@ -2553,7 +2553,7 @@ curve25519: MOV r3, sp STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} ADD r3, sp, #0x40 - # Copy + /* Copy */ LDM r2, 
{r4, r5, r6, r7, r8, r9, r10, r11} STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} MOV r1, #0x1e @@ -2572,7 +2572,7 @@ L_curve25519_bits: EOR r1, r1, r2 STR r1, [sp, #172] LDR r0, [sp, #160] - # Conditional Swap + /* Conditional Swap */ RSB r1, r1, #0x0 MOV r3, r0 ADD r12, sp, #0x40 @@ -2625,7 +2625,7 @@ L_curve25519_bits: STM r3!, {r4, r5} STM r12!, {r6, r7} LDR r1, [sp, #172] - # Conditional Swap + /* Conditional Swap */ RSB r1, r1, #0x0 MOV r3, sp ADD r12, sp, #0x20 @@ -2741,21 +2741,21 @@ L_curve25519_bits: LDR r1, [sp, #180] SUBS r1, r1, #0x1 STR r1, [sp, #180] -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BGE L_curve25519_bits #else - BGE.N L_curve25519_bits + BGE.W L_curve25519_bits #endif MOV r1, #0x1f STR r1, [sp, #180] SUBS r2, r2, #0x4 STR r2, [sp, #176] -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BGE L_curve25519_words #else - BGE.N L_curve25519_words + BGE.W L_curve25519_words #endif - # Invert + /* Invert */ ADD r1, sp, #0x0 ADD r0, sp, #0x20 BL fe_sq_op @@ -2938,7 +2938,7 @@ L_curve25519_inv_8: MOV r0, #0x0 ADD sp, sp, #0xbc POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 682 + /* Cycle Count = 682 */ .size curve25519,.-curve25519 #else .text @@ -2973,7 +2973,7 @@ curve25519: MOV r3, sp STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} ADD r3, sp, #0x40 - # Copy + /* Copy */ LDM r2, {r4, r5, r6, r7, r8, r9, r10, r11} STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} MOV r2, #0xfe @@ -2989,7 +2989,7 @@ L_curve25519_bits: EOR r1, r1, r2 ASR r1, r1, #31 STR r2, [sp, #164] - # Conditional Swap + /* Conditional Swap */ ADD r11, sp, #0xb0 LDM r11, {r4, r5, r6, r7} EOR r8, r4, r5 @@ -3001,7 +3001,7 @@ L_curve25519_bits: EOR r6, r6, r9 EOR r7, r7, r9 STM r11, {r4, r5, r6, r7} - # Ladder step + /* Ladder step */ LDR r3, [sp, #184] LDR r2, [sp, #176] ADD r1, sp, #0x80 @@ -3067,12 +3067,12 @@ L_curve25519_bits: #else BGE.N L_curve25519_bits #endif - # Cycle Count: 171 
+ /* Cycle Count: 171 */ LDR r1, [sp, #184] - # Copy + /* Copy */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} STM sp, {r4, r5, r6, r7, r8, r9, r10, r11} - # Invert + /* Invert */ ADD r1, sp, #0x0 ADD r0, sp, #0x20 BL fe_sq_op @@ -3252,7 +3252,7 @@ L_curve25519_inv_8: LDR r1, [sp, #176] LDR r0, [sp, #176] BL fe_mul_op - # Ensure result is less than modulus + /* Ensure result is less than modulus */ LDR r0, [sp, #176] LDM r0, {r4, r5, r6, r7, r8, r9, r10, r11} MOV r2, #0x13 @@ -3270,7 +3270,7 @@ L_curve25519_inv_8: MOV r0, #0x0 ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 589 + /* Cycle Count = 589 */ .size curve25519,.-curve25519 #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_CURVE25519 */ @@ -3282,7 +3282,7 @@ L_curve25519_inv_8: fe_invert: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0x88 - # Invert + /* Invert */ STR r0, [sp, #128] STR r1, [sp, #132] LDR r1, [sp, #132] @@ -3464,7 +3464,7 @@ L_fe_invert8: LDR r0, [sp, #128] ADD sp, sp, #0x88 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 292 + /* Cycle Count = 292 */ .size fe_invert,.-fe_invert #ifdef WOLFSSL_SP_NO_UMAAL .text @@ -3475,33 +3475,33 @@ fe_sq2: PUSH {lr} SUB sp, sp, #0x44 STR r0, [sp, #64] - # Square * 2 + /* Square * 2 */ MOV r0, #0x0 LDR r12, [r1] - # A[0] * A[1] + /* A[0] * A[1] */ LDR lr, [r1, #4] UMULL r4, r5, r12, lr - # A[0] * A[3] + /* A[0] * A[3] */ LDR lr, [r1, #12] UMULL r6, r7, r12, lr - # A[0] * A[5] + /* A[0] * A[5] */ LDR lr, [r1, #20] UMULL r8, r9, r12, lr - # A[0] * A[7] + /* A[0] * A[7] */ LDR lr, [r1, #28] UMULL r10, r3, r12, lr - # A[0] * A[2] + /* A[0] * A[2] */ LDR lr, [r1, #8] MOV r11, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[0] * A[4] + /* A[0] * A[4] */ LDR lr, [r1, #16] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[0] * A[6] + /* A[0] * A[6] */ LDR lr, [r1, #24] ADCS r9, r9, #0x0 ADC r11, r0, #0x0 @@ -3510,112 +3510,112 @@ fe_sq2: ADCS r3, r3, #0x0 STR r4, [sp, #4] STR 
r5, [sp, #8] - # A[1] * A[2] + /* A[1] * A[2] */ LDR r12, [r1, #4] LDR lr, [r1, #8] MOV r11, #0x0 UMLAL r6, r11, r12, lr STR r6, [sp, #12] ADDS r7, r7, r11 - # A[1] * A[3] + /* A[1] * A[3] */ LDR lr, [r1, #12] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr STR r7, [sp, #16] ADDS r8, r8, r11 - # A[1] * A[4] + /* A[1] * A[4] */ LDR lr, [r1, #16] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[1] * A[5] + /* A[1] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[1] * A[6] + /* A[1] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[1] * A[7] + /* A[1] * A[7] */ LDR lr, [r1, #28] ADC r4, r0, #0x0 UMLAL r3, r4, r12, lr - # A[2] * A[3] + /* A[2] * A[3] */ LDR r12, [r1, #8] LDR lr, [r1, #12] MOV r11, #0x0 UMLAL r8, r11, r12, lr STR r8, [sp, #20] ADDS r9, r9, r11 - # A[2] * A[4] + /* A[2] * A[4] */ LDR lr, [r1, #16] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr STR r9, [sp, #24] ADDS r10, r10, r11 - # A[2] * A[5] + /* A[2] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[2] * A[6] + /* A[2] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[2] * A[7] + /* A[2] * A[7] */ LDR lr, [r1, #28] ADC r5, r0, #0x0 UMLAL r4, r5, r12, lr - # A[3] * A[4] + /* A[3] * A[4] */ LDR r12, [r1, #12] LDR lr, [r1, #16] MOV r11, #0x0 UMLAL r10, r11, r12, lr STR r10, [sp, #28] ADDS r3, r3, r11 - # A[3] * A[5] + /* A[3] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[3] * A[6] + /* A[3] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[3] * A[7] + /* A[3] * A[7] */ LDR lr, [r1, #28] ADC r6, r0, #0x0 UMLAL r5, r6, r12, lr - # A[4] * A[5] + /* A[4] * A[5] */ LDR r12, [r1, #16] LDR lr, [r1, #20] MOV r11, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[4] * A[6] + /* A[4] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr 
ADDS r6, r6, r11 - # A[4] * A[7] + /* A[4] * A[7] */ LDR lr, [r1, #28] ADC r7, r0, #0x0 UMLAL r6, r7, r12, lr - # A[5] * A[6] + /* A[5] * A[6] */ LDR r12, [r1, #20] LDR lr, [r1, #24] MOV r11, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[5] * A[7] + /* A[5] * A[7] */ LDR lr, [r1, #28] ADC r8, r0, #0x0 UMLAL r7, r8, r12, lr - # A[6] * A[7] + /* A[6] * A[7] */ LDR r12, [r1, #24] LDR lr, [r1, #28] MOV r9, #0x0 @@ -3645,23 +3645,23 @@ fe_sq2: ADD lr, sp, #0x4 LDM lr, {r4, r5, r6, r7, r8, r9, r10} MOV lr, sp - # A[0] * A[0] + /* A[0] * A[0] */ LDR r12, [r1] UMULL r3, r11, r12, r12 ADDS r4, r4, r11 - # A[1] * A[1] + /* A[1] * A[1] */ LDR r12, [r1, #4] ADCS r5, r5, #0x0 ADC r11, r0, #0x0 UMLAL r5, r11, r12, r12 ADDS r6, r6, r11 - # A[2] * A[2] + /* A[2] * A[2] */ LDR r12, [r1, #8] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, r12 ADDS r8, r8, r11 - # A[3] * A[3] + /* A[3] * A[3] */ LDR r12, [r1, #12] ADCS r9, r9, #0x0 ADC r11, r0, #0x0 @@ -3669,30 +3669,30 @@ fe_sq2: ADDS r10, r10, r11 STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10} LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10} - # A[4] * A[4] + /* A[4] * A[4] */ LDR r12, [r1, #16] ADCS r3, r3, #0x0 ADC r11, r0, #0x0 UMLAL r3, r11, r12, r12 ADDS r4, r4, r11 - # A[5] * A[5] + /* A[5] * A[5] */ LDR r12, [r1, #20] ADCS r5, r5, #0x0 ADC r11, r0, #0x0 UMLAL r5, r11, r12, r12 ADDS r6, r6, r11 - # A[6] * A[6] + /* A[6] * A[6] */ LDR r12, [r1, #24] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, r12 ADDS r8, r8, r11 - # A[7] * A[7] + /* A[7] * A[7] */ LDR r12, [r1, #28] ADCS r9, r9, #0x0 ADC r10, r10, #0x0 UMLAL r9, r10, r12, r12 - # Reduce + /* Reduce */ LDR r2, [sp, #28] MOV lr, sp MOV r12, #0x26 @@ -3731,7 +3731,7 @@ fe_sq2: UMLAL r7, r11, r9, r12 BFC r10, #31, #1 ADDS r8, r10, r11 - # Reduce if top bit set + /* Reduce if top bit set */ MOV r12, #0x13 AND r11, r12, r8, ASR #31 ADDS r1, r1, r11 @@ -3743,7 +3743,7 @@ fe_sq2: BFC r8, #31, #1 ADCS r7, r7, #0x0 ADC r8, r8, #0x0 - # Double + /* Double */ ADDS r1, r1, 
r1 ADCS r2, r2, r2 ADCS r3, r3, r3 @@ -3752,7 +3752,7 @@ fe_sq2: ADCS r6, r6, r6 ADCS r7, r7, r7 ADC r8, r8, r8 - # Reduce if top bit set + /* Reduce if top bit set */ MOV r12, #0x13 AND r11, r12, r8, ASR #31 ADDS r1, r1, r11 @@ -3764,12 +3764,12 @@ fe_sq2: BFC r8, #31, #1 ADCS r7, r7, #0x0 ADC r8, r8, #0x0 - # Store + /* Store */ LDR r0, [sp, #64] STM r0, {r1, r2, r3, r4, r5, r6, r7, r8} ADD sp, sp, #0x44 POP {pc} - # Cycle Count = 385 + /* Cycle Count = 385 */ .size fe_sq2,.-fe_sq2 #else .text @@ -3781,7 +3781,7 @@ fe_sq2: SUB sp, sp, #0x24 STRD r0, r1, [sp, #28] LDM r1, {r0, r1, r2, r3, r4, r5, r6, r7} - # Square * 2 + /* Square * 2 */ UMULL r9, r10, r0, r0 UMULL r11, r12, r0, r1 ADDS r11, r11, r11 @@ -3822,46 +3822,46 @@ fe_sq2: UMAAL r0, r10, r3, r4 ADCS r0, r0, r0 UMAAL r0, r11, lr, lr - # R[7] = r0 + /* R[7] = r0 */ UMAAL r9, r8, r1, r7 UMAAL r9, r10, r2, r6 UMAAL r12, r9, r3, r5 ADCS r12, r12, r12 UMAAL r12, r11, r4, r4 - # R[8] = r12 + /* R[8] = r12 */ UMAAL r9, r8, r2, r7 UMAAL r10, r9, r3, r6 MOV r2, lr UMAAL r10, r2, r4, r5 ADCS r10, r10, r10 UMAAL r11, r10, lr, lr - # R[9] = r11 + /* R[9] = r11 */ UMAAL r2, r8, r3, r7 UMAAL r2, r9, r4, r6 ADCS r3, r2, r2 UMAAL r10, r3, r5, r5 - # R[10] = r10 + /* R[10] = r10 */ MOV r1, lr UMAAL r1, r8, r4, r7 UMAAL r1, r9, r5, r6 ADCS r4, r1, r1 UMAAL r3, r4, lr, lr - # R[11] = r3 + /* R[11] = r3 */ UMAAL r8, r9, r5, r7 ADCS r8, r8, r8 UMAAL r4, r8, r6, r6 - # R[12] = r4 + /* R[12] = r4 */ MOV r5, lr UMAAL r5, r9, r6, r7 ADCS r5, r5, r5 UMAAL r8, r5, lr, lr - # R[13] = r8 + /* R[13] = r8 */ ADCS r9, r9, r9 UMAAL r9, r5, r7, r7 ADCS r7, r5, lr - # R[14] = r9 - # R[15] = r7 - # Reduce + /* R[14] = r9 */ + /* R[15] = r7 */ + /* Reduce */ MOV r6, #0x25 UMAAL r7, r0, r7, r6 MOV r6, #0x13 @@ -3884,7 +3884,7 @@ fe_sq2: BFC r7, #31, #1 UMAAL r6, lr, r9, r12 ADD r7, r7, lr - # Reduce if top bit set + /* Reduce if top bit set */ MOV r11, #0x13 AND r12, r11, r7, ASR #31 ADDS r0, r0, r12 @@ -3896,7 +3896,7 @@ fe_sq2: BFC r7, #31, 
#1 ADCS r6, r6, #0x0 ADC r7, r7, #0x0 - # Double + /* Double */ ADDS r0, r0, r0 ADCS r1, r1, r1 ADCS r2, r2, r2 @@ -3905,7 +3905,7 @@ fe_sq2: ADCS r5, r5, r5 ADCS r6, r6, r6 ADC r7, r7, r7 - # Reduce if top bit set + /* Reduce if top bit set */ MOV r11, #0x13 AND r12, r11, r7, ASR #31 ADDS r0, r0, r12 @@ -3918,12 +3918,12 @@ fe_sq2: ADCS r6, r6, #0x0 ADC r7, r7, #0x0 POP {r12, lr} - # Store + /* Store */ STM r12, {r0, r1, r2, r3, r4, r5, r6, r7} MOV r0, r12 MOV r1, lr POP {pc} - # Cycle Count = 213 + /* Cycle Count = 213 */ .size fe_sq2,.-fe_sq2 #endif /* WOLFSSL_SP_NO_UMAAL */ .text @@ -3933,7 +3933,7 @@ fe_sq2: fe_pow22523: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0x68 - # pow22523 + /* pow22523 */ STR r0, [sp, #96] STR r1, [sp, #100] LDR r1, [sp, #100] @@ -4115,7 +4115,7 @@ L_fe_pow22523_8: LDR r0, [sp, #96] ADD sp, sp, #0x68 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 293 + /* Cycle Count = 293 */ .size fe_pow22523,.-fe_pow22523 .text .align 4 @@ -4142,7 +4142,7 @@ ge_p1p1_to_p2: BL fe_mul_op ADD sp, sp, #0x8 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 53 + /* Cycle Count = 53 */ .size ge_p1p1_to_p2,.-ge_p1p1_to_p2 .text .align 4 @@ -4174,7 +4174,7 @@ ge_p1p1_to_p3: BL fe_mul_op ADD sp, sp, #0x8 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 63 + /* Cycle Count = 63 */ .size ge_p1p1_to_p3,.-ge_p1p1_to_p3 .text .align 4 @@ -4218,7 +4218,7 @@ ge_p2_dbl: BL fe_sub_op ADD sp, sp, #0x8 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 87 + /* Cycle Count = 87 */ .size ge_p2_dbl,.-ge_p2_dbl .text .align 4 @@ -4264,7 +4264,7 @@ ge_madd: LDR r1, [sp, #4] ADD r1, r1, #0x40 ADD r0, r0, #0x20 - # Double + /* Double */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} ADDS r4, r4, r4 ADCS r5, r5, r5 @@ -4290,13 +4290,13 @@ ge_madd: ADCS r10, r10, #0x0 ADC r11, r11, #0x0 STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Double + /* Done Double */ ADD r3, r0, #0x20 ADD r1, r0, #0x20 BL fe_add_sub_op ADD 
sp, sp, #0xc POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 136 + /* Cycle Count = 136 */ .size ge_madd,.-ge_madd .text .align 4 @@ -4342,7 +4342,7 @@ ge_msub: LDR r1, [sp, #4] ADD r1, r1, #0x40 ADD r0, r0, #0x20 - # Double + /* Double */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} ADDS r4, r4, r4 ADCS r5, r5, r5 @@ -4368,14 +4368,14 @@ ge_msub: ADCS r10, r10, #0x0 ADC r11, r11, #0x0 STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Double + /* Done Double */ ADD r3, r0, #0x20 MOV r1, r0 ADD r0, r0, #0x20 BL fe_add_sub_op ADD sp, sp, #0xc POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 137 + /* Cycle Count = 137 */ .size ge_msub,.-ge_msub .text .align 4 @@ -4416,7 +4416,7 @@ ge_add: BL fe_mul_op LDR r1, [sp] ADD r0, sp, #0xc - # Double + /* Double */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} ADDS r4, r4, r4 ADCS r5, r5, r5 @@ -4442,7 +4442,7 @@ ge_add: ADCS r10, r10, #0x0 ADC r11, r11, #0x0 STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Double + /* Done Double */ ADD r3, r1, #0x20 ADD r2, r1, #0x40 ADD r0, r1, #0x20 @@ -4454,7 +4454,7 @@ ge_add: BL fe_add_sub_op ADD sp, sp, #0x2c POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 138 + /* Cycle Count = 138 */ .size ge_add,.-ge_add .text .align 4 @@ -4495,7 +4495,7 @@ ge_sub: BL fe_mul_op LDR r1, [sp] ADD r0, sp, #0xc - # Double + /* Double */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} ADDS r4, r4, r4 ADCS r5, r5, r5 @@ -4521,7 +4521,7 @@ ge_sub: ADCS r10, r10, #0x0 ADC r11, r11, #0x0 STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Double + /* Done Double */ ADD r3, r1, #0x20 ADD r2, r1, #0x40 ADD r0, r1, #0x20 @@ -4533,7 +4533,7 @@ ge_sub: BL fe_add_sub_op ADD sp, sp, #0x2c POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 138 + /* Cycle Count = 138 */ .size ge_sub,.-ge_sub #ifdef WOLFSSL_SP_NO_UMAAL .text @@ -4544,7 +4544,7 @@ sc_reduce: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0x38 STR r0, [sp, #52] - # Load bits 252-511 + /* Load bits 
252-511 */ ADD r0, r0, #0x1c LDM r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} LSR lr, r9, #24 @@ -4566,7 +4566,7 @@ sc_reduce: ORR r2, r2, r1, LSR #28 BFC r9, #28, #4 SUB r0, r0, #0x1c - # Add order times bits 504..511 + /* Add order times bits 504..511 */ MOV r10, #0x2c13 MOVT r10, #0xa30a MOV r11, #0x9ce5 @@ -4597,7 +4597,7 @@ sc_reduce: SBCS r7, r7, #0x0 SBCS r8, r8, #0x0 SBC r9, r9, #0x0 - # Sub product of top 8 words and order + /* Sub product of top 8 words and order */ MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a @@ -4765,7 +4765,7 @@ sc_reduce: UMLAL r11, lr, r9, r1 STM r12!, {r10, r11, lr} SUB r12, r12, #0x20 - # Subtract at 4 * 32 + /* Subtract at 4 * 32 */ LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 @@ -4784,7 +4784,7 @@ sc_reduce: STM r12!, {r10, r11} SUB r12, r12, #0x24 ASR lr, r11, #25 - # Conditionally subtract order starting at bit 125 + /* Conditionally subtract order starting at bit 125 */ MOV r1, #0xa0000000 MOV r2, #0xba7d MOVT r2, #0x4b9e @@ -4822,7 +4822,7 @@ sc_reduce: STM r12!, {r10} SUB r0, r0, #0x10 MOV r12, sp - # Load bits 252-376 + /* Load bits 252-376 */ ADD r12, r12, #0x1c LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 @@ -4835,9 +4835,9 @@ sc_reduce: ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 SUB r12, r12, #0x1c - # Sub product of top 4 words and order + /* Sub product of top 4 words and order */ MOV r0, sp - # * -5cf5d3ed + /* * -5cf5d3ed */ MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 @@ -4857,7 +4857,7 @@ sc_reduce: UMLAL r9, lr, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -5812631b + /* * -5812631b */ MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 @@ -4877,7 +4877,7 @@ sc_reduce: UMLAL r9, r10, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -a2f79cd7 + /* * -a2f79cd7 */ MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 @@ -4897,7 +4897,7 @@ sc_reduce: UMLAL r9, r11, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -14def9df + /* * -14def9df */ MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 @@ -4917,14 
+4917,14 @@ sc_reduce: UMLAL r9, r12, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # Add overflows at 4 * 32 + /* Add overflows at 4 * 32 */ LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 ADCS r8, r8, r11 ADC r9, r9, r12 - # Subtract top at 4 * 32 + /* Subtract top at 4 * 32 */ SUBS r6, r6, r2 SBCS r7, r7, r3 SBCS r8, r8, r4 @@ -4954,12 +4954,12 @@ sc_reduce: ADCS r8, r8, #0x0 ADC r9, r9, r1 BFC r9, #28, #4 - # Store result + /* Store result */ LDR r0, [sp, #52] STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} ADD sp, sp, #0x38 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 588 + /* Cycle Count = 588 */ .size sc_reduce,.-sc_reduce #else .text @@ -4970,7 +4970,7 @@ sc_reduce: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0x38 STR r0, [sp, #52] - # Load bits 252-511 + /* Load bits 252-511 */ ADD r0, r0, #0x1c LDM r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} LSR lr, r9, #24 @@ -4992,7 +4992,7 @@ sc_reduce: ORR r2, r2, r1, LSR #28 BFC r9, #28, #4 SUB r0, r0, #0x1c - # Add order times bits 504..511 + /* Add order times bits 504..511 */ MOV r10, #0x2c13 MOVT r10, #0xa30a MOV r11, #0x9ce5 @@ -5014,7 +5014,7 @@ sc_reduce: SBCS r7, r7, #0x0 SBCS r8, r8, #0x0 SBC r9, r9, #0x0 - # Sub product of top 8 words and order + /* Sub product of top 8 words and order */ MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a @@ -5098,7 +5098,7 @@ sc_reduce: UMAAL r11, lr, r9, r1 STM r12!, {r10, r11, lr} SUB r12, r12, #0x20 - # Subtract at 4 * 32 + /* Subtract at 4 * 32 */ LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 @@ -5117,7 +5117,7 @@ sc_reduce: STM r12!, {r10, r11} SUB r12, r12, #0x24 ASR lr, r11, #25 - # Conditionally subtract order starting at bit 125 + /* Conditionally subtract order starting at bit 125 */ MOV r1, #0xa0000000 MOV r2, #0xba7d MOVT r2, #0x4b9e @@ -5155,7 +5155,7 @@ sc_reduce: STM r12!, {r10} SUB r0, r0, #0x10 MOV r12, sp - # Load bits 252-376 + /* Load bits 252-376 */ ADD r12, r12, #0x1c LDM r12, {r1, r2, r3, r4, r5} 
LSL r5, r5, #4 @@ -5168,9 +5168,9 @@ sc_reduce: ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 SUB r12, r12, #0x1c - # Sub product of top 4 words and order + /* Sub product of top 4 words and order */ MOV r0, sp - # * -5cf5d3ed + /* * -5cf5d3ed */ MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 @@ -5181,7 +5181,7 @@ sc_reduce: UMAAL r9, lr, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -5812631b + /* * -5812631b */ MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 @@ -5192,7 +5192,7 @@ sc_reduce: UMAAL r9, r10, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -a2f79cd7 + /* * -a2f79cd7 */ MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 @@ -5203,7 +5203,7 @@ sc_reduce: UMAAL r9, r11, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -14def9df + /* * -14def9df */ MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 @@ -5214,14 +5214,14 @@ sc_reduce: UMAAL r9, r12, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # Add overflows at 4 * 32 + /* Add overflows at 4 * 32 */ LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 ADCS r8, r8, r11 ADC r9, r9, r12 - # Subtract top at 4 * 32 + /* Subtract top at 4 * 32 */ SUBS r6, r6, r2 SBCS r7, r7, r3 SBCS r8, r8, r4 @@ -5251,12 +5251,12 @@ sc_reduce: ADCS r8, r8, #0x0 ADC r9, r9, r1 BFC r9, #28, #4 - # Store result + /* Store result */ LDR r0, [sp, #52] STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} ADD sp, sp, #0x38 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 502 + /* Cycle Count = 502 */ .size sc_reduce,.-sc_reduce #endif /* WOLFSSL_SP_NO_UMAAL */ #ifdef HAVE_ED25519_SIGN @@ -5272,332 +5272,332 @@ sc_muladd: STM lr, {r0, r1, r3} MOV r0, #0x0 LDR r12, [r1] - # A[0] * B[0] + /* A[0] * B[0] */ LDR lr, [r2] UMULL r3, r4, r12, lr - # A[0] * B[2] + /* A[0] * B[2] */ LDR lr, [r2, #8] UMULL r5, r6, r12, lr - # A[0] * B[4] + /* A[0] * B[4] */ LDR lr, [r2, #16] UMULL r7, r8, r12, lr - # A[0] * B[6] + /* A[0] * B[6] */ LDR lr, [r2, #24] UMULL r9, r10, r12, lr STR r3, [sp] - # A[0] * B[1] + /* A[0] * 
B[1] */ LDR lr, [r2, #4] MOV r11, r0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[0] * B[3] + /* A[0] * B[3] */ LDR lr, [r2, #12] ADCS r6, r6, #0x0 ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[0] * B[5] + /* A[0] * B[5] */ LDR lr, [r2, #20] ADCS r8, r8, #0x0 ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[0] * B[7] + /* A[0] * B[7] */ LDR lr, [r2, #28] ADCS r10, r10, #0x0 ADC r3, r0, #0x0 UMLAL r10, r3, r12, lr - # A[1] * B[0] + /* A[1] * B[0] */ LDR r12, [r1, #4] LDR lr, [r2] MOV r11, #0x0 UMLAL r4, r11, r12, lr STR r4, [sp, #4] ADDS r5, r5, r11 - # A[1] * B[1] + /* A[1] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[1] * B[2] + /* A[1] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[1] * B[3] + /* A[1] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[1] * B[4] + /* A[1] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[1] * B[5] + /* A[1] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[1] * B[6] + /* A[1] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[1] * B[7] + /* A[1] * B[7] */ LDR lr, [r2, #28] ADC r4, r0, #0x0 UMLAL r3, r4, r12, lr - # A[2] * B[0] + /* A[2] * B[0] */ LDR r12, [r1, #8] LDR lr, [r2] MOV r11, #0x0 UMLAL r5, r11, r12, lr STR r5, [sp, #8] ADDS r6, r6, r11 - # A[2] * B[1] + /* A[2] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[2] * B[2] + /* A[2] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[2] * B[3] + /* A[2] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[2] * B[4] + /* A[2] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[2] * B[5] + /* A[2] * B[5] */ LDR lr, [r2, #20] ADC 
r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[2] * B[6] + /* A[2] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[2] * B[7] + /* A[2] * B[7] */ LDR lr, [r2, #28] ADC r5, r0, #0x0 UMLAL r4, r5, r12, lr - # A[3] * B[0] + /* A[3] * B[0] */ LDR r12, [r1, #12] LDR lr, [r2] MOV r11, #0x0 UMLAL r6, r11, r12, lr STR r6, [sp, #12] ADDS r7, r7, r11 - # A[3] * B[1] + /* A[3] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[3] * B[2] + /* A[3] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[3] * B[3] + /* A[3] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[3] * B[4] + /* A[3] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[3] * B[5] + /* A[3] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[3] * B[6] + /* A[3] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[3] * B[7] + /* A[3] * B[7] */ LDR lr, [r2, #28] ADC r6, r0, #0x0 UMLAL r5, r6, r12, lr - # A[4] * B[0] + /* A[4] * B[0] */ LDR r12, [r1, #16] LDR lr, [r2] MOV r11, #0x0 UMLAL r7, r11, r12, lr STR r7, [sp, #16] ADDS r8, r8, r11 - # A[4] * B[1] + /* A[4] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[4] * B[2] + /* A[4] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[4] * B[3] + /* A[4] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[4] * B[4] + /* A[4] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[4] * B[5] + /* A[4] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[4] * B[6] + /* A[4] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[4] * B[7] + /* 
A[4] * B[7] */ LDR lr, [r2, #28] ADC r7, r0, #0x0 UMLAL r6, r7, r12, lr - # A[5] * B[0] + /* A[5] * B[0] */ LDR r12, [r1, #20] LDR lr, [r2] MOV r11, #0x0 UMLAL r8, r11, r12, lr STR r8, [sp, #20] ADDS r9, r9, r11 - # A[5] * B[1] + /* A[5] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[5] * B[2] + /* A[5] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[5] * B[3] + /* A[5] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[5] * B[4] + /* A[5] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[5] * B[5] + /* A[5] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[5] * B[6] + /* A[5] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[5] * B[7] + /* A[5] * B[7] */ LDR lr, [r2, #28] ADC r8, r0, #0x0 UMLAL r7, r8, r12, lr - # A[6] * B[0] + /* A[6] * B[0] */ LDR r12, [r1, #24] LDR lr, [r2] MOV r11, #0x0 UMLAL r9, r11, r12, lr STR r9, [sp, #24] ADDS r10, r10, r11 - # A[6] * B[1] + /* A[6] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[6] * B[2] + /* A[6] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[6] * B[3] + /* A[6] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[6] * B[4] + /* A[6] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[6] * B[5] + /* A[6] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[6] * B[6] + /* A[6] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[6] * B[7] + /* A[6] * B[7] */ LDR lr, [r2, #28] ADC r9, r0, #0x0 UMLAL r8, r9, r12, lr - # A[7] * B[0] + /* A[7] * B[0] */ LDR r12, [r1, #28] LDR lr, [r2] MOV r11, #0x0 UMLAL r10, r11, r12, lr STR r10, 
[sp, #28] ADDS r3, r3, r11 - # A[7] * B[1] + /* A[7] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[7] * B[2] + /* A[7] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[7] * B[3] + /* A[7] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[7] * B[4] + /* A[7] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[7] * B[5] + /* A[7] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[7] * B[6] + /* A[7] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[7] * B[7] + /* A[7] * B[7] */ LDR lr, [r2, #28] ADC r10, r0, #0x0 UMLAL r9, r10, r12, lr ADD lr, sp, #0x20 STM lr, {r3, r4, r5, r6, r7, r8, r9, r10} MOV r0, sp - # Add c to a * b + /* Add c to a * b */ LDR lr, [sp, #76] LDM r0, {r2, r3, r4, r5, r6, r7, r8, r9} LDM lr!, {r1, r10, r11, r12} @@ -5622,7 +5622,7 @@ sc_muladd: ADCS r8, r8, #0x0 ADC r9, r9, #0x0 SUB r0, r0, #0x20 - # Get 252..503 and 504..507 + /* Get 252..503 and 504..507 */ LSR lr, r9, #24 LSL r9, r9, #4 ORR r9, r9, r8, LSR #28 @@ -5641,7 +5641,7 @@ sc_muladd: LSL r2, r2, #4 ORR r2, r2, r1, LSR #28 BFC r9, #28, #4 - # Add order times bits 504..507 + /* Add order times bits 504..507 */ MOV r10, #0x2c13 MOVT r10, #0xa30a MOV r11, #0x9ce5 @@ -5672,7 +5672,7 @@ sc_muladd: SBCS r7, r7, #0x0 SBCS r8, r8, #0x0 SBC r9, r9, #0x0 - # Sub product of top 8 words and order + /* Sub product of top 8 words and order */ MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a @@ -5840,7 +5840,7 @@ sc_muladd: UMLAL r11, lr, r9, r1 STM r12!, {r10, r11, lr} SUB r12, r12, #0x20 - # Subtract at 4 * 32 + /* Subtract at 4 * 32 */ LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 @@ -5859,7 +5859,7 @@ sc_muladd: STM r12!, {r10, r11} SUB r12, r12, #0x24 ASR lr, r11, #25 - # Conditionally subtract order starting at bit 125 + /* Conditionally 
subtract order starting at bit 125 */ MOV r1, #0xa0000000 MOV r2, #0xba7d MOVT r2, #0x4b9e @@ -5897,7 +5897,7 @@ sc_muladd: STM r12!, {r10} SUB r0, r0, #0x10 MOV r12, sp - # Load bits 252-376 + /* Load bits 252-376 */ ADD r12, r12, #0x1c LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 @@ -5910,9 +5910,9 @@ sc_muladd: ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 SUB r12, r12, #0x1c - # Sub product of top 4 words and order + /* Sub product of top 4 words and order */ MOV r0, sp - # * -5cf5d3ed + /* * -5cf5d3ed */ MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 @@ -5932,7 +5932,7 @@ sc_muladd: UMLAL r9, lr, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -5812631b + /* * -5812631b */ MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 @@ -5952,7 +5952,7 @@ sc_muladd: UMLAL r9, r10, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -a2f79cd7 + /* * -a2f79cd7 */ MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 @@ -5972,7 +5972,7 @@ sc_muladd: UMLAL r9, r11, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -14def9df + /* * -14def9df */ MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 @@ -5992,14 +5992,14 @@ sc_muladd: UMLAL r9, r12, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # Add overflows at 4 * 32 + /* Add overflows at 4 * 32 */ LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 ADCS r8, r8, r11 ADC r9, r9, r12 - # Subtract top at 4 * 32 + /* Subtract top at 4 * 32 */ SUBS r6, r6, r2 SBCS r7, r7, r3 SBCS r8, r8, r4 @@ -6030,7 +6030,7 @@ sc_muladd: ADC r9, r9, r1 BFC r9, #28, #4 LDR r0, [sp, #68] - # Store result + /* Store result */ STR r2, [r0] STR r3, [r0, #4] STR r4, [r0, #8] @@ -6041,7 +6041,7 @@ sc_muladd: STR r9, [r0, #28] ADD sp, sp, #0x50 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 994 + /* Cycle Count = 994 */ .size sc_muladd,.-sc_muladd #else .text @@ -6153,7 +6153,7 @@ sc_muladd: ADD lr, sp, #0x20 STM lr, {r3, r4, r5, r6, r7, r8, r9, r10} MOV r0, sp - # Add c to a * b + /* Add c to a * b */ LDR lr, [sp, #76] LDM 
r0, {r2, r3, r4, r5, r6, r7, r8, r9} LDM lr!, {r1, r10, r11, r12} @@ -6178,7 +6178,7 @@ sc_muladd: ADCS r8, r8, #0x0 ADC r9, r9, #0x0 SUB r0, r0, #0x20 - # Get 252..503 and 504..507 + /* Get 252..503 and 504..507 */ LSR lr, r9, #24 LSL r9, r9, #4 ORR r9, r9, r8, LSR #28 @@ -6197,7 +6197,7 @@ sc_muladd: LSL r2, r2, #4 ORR r2, r2, r1, LSR #28 BFC r9, #28, #4 - # Add order times bits 504..507 + /* Add order times bits 504..507 */ MOV r10, #0x2c13 MOVT r10, #0xa30a MOV r11, #0x9ce5 @@ -6219,7 +6219,7 @@ sc_muladd: SBCS r7, r7, #0x0 SBCS r8, r8, #0x0 SBC r9, r9, #0x0 - # Sub product of top 8 words and order + /* Sub product of top 8 words and order */ MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a @@ -6303,7 +6303,7 @@ sc_muladd: UMAAL r11, lr, r9, r1 STM r12!, {r10, r11, lr} SUB r12, r12, #0x20 - # Subtract at 4 * 32 + /* Subtract at 4 * 32 */ LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 @@ -6322,7 +6322,7 @@ sc_muladd: STM r12!, {r10, r11} SUB r12, r12, #0x24 ASR lr, r11, #25 - # Conditionally subtract order starting at bit 125 + /* Conditionally subtract order starting at bit 125 */ MOV r1, #0xa0000000 MOV r2, #0xba7d MOVT r2, #0x4b9e @@ -6360,7 +6360,7 @@ sc_muladd: STM r12!, {r10} SUB r0, r0, #0x10 MOV r12, sp - # Load bits 252-376 + /* Load bits 252-376 */ ADD r12, r12, #0x1c LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 @@ -6373,9 +6373,9 @@ sc_muladd: ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 SUB r12, r12, #0x1c - # Sub product of top 4 words and order + /* Sub product of top 4 words and order */ MOV r0, sp - # * -5cf5d3ed + /* * -5cf5d3ed */ MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 @@ -6386,7 +6386,7 @@ sc_muladd: UMAAL r9, lr, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -5812631b + /* * -5812631b */ MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 @@ -6397,7 +6397,7 @@ sc_muladd: UMAAL r9, r10, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -a2f79cd7 + /* * -a2f79cd7 */ MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 @@ -6408,7 
+6408,7 @@ sc_muladd: UMAAL r9, r11, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -14def9df + /* * -14def9df */ MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 @@ -6419,14 +6419,14 @@ sc_muladd: UMAAL r9, r12, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # Add overflows at 4 * 32 + /* Add overflows at 4 * 32 */ LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 ADCS r8, r8, r11 ADC r9, r9, r12 - # Subtract top at 4 * 32 + /* Subtract top at 4 * 32 */ SUBS r6, r6, r2 SBCS r7, r7, r3 SBCS r8, r8, r4 @@ -6457,7 +6457,7 @@ sc_muladd: ADC r9, r9, r1 BFC r9, #28, #4 LDR r0, [sp, #68] - # Store result + /* Store result */ STR r2, [r0] STR r3, [r0, #4] STR r4, [r0, #8] @@ -6468,7 +6468,7 @@ sc_muladd: STR r9, [r0, #28] ADD sp, sp, #0x50 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 752 + /* Cycle Count = 752 */ .size sc_muladd,.-sc_muladd #endif /* WOLFSSL_SP_NO_UMAAL */ #endif /* HAVE_ED25519_SIGN */ diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c index edb2af0683..a5403e99ed 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c +++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c @@ -39,7 +39,7 @@ #ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_ARMASM -#if !defined(__aarch64__) && defined(__arm__) +#if !defined(__aarch64__) && defined(__thumb__) #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm @@ -2796,9 +2796,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "MOV %[a], #0x1c\n\t" "STR %[a], [sp, #176]\n\t" "\n" - "L_curve25519_words_%=:\n\t" + "L_curve25519_words:\n\t" "\n" - "L_curve25519_bits_%=:\n\t" + "L_curve25519_bits:\n\t" "LDR %[n], [sp, #164]\n\t" "LDR %[a], [%[n], r2]\n\t" "LDR %[n], [sp, #180]\n\t" @@ -2978,19 +2978,19 @@ int curve25519(byte* r, const byte* n, const byte* a) "LDR %[n], [sp, #180]\n\t" "SUBS %[n], %[n], #0x1\n\t" "STR %[n], [sp, #180]\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGE 
L_curve25519_bits_%=\n\t" +#ifdef __GNUC__ + "BGE L_curve25519_bits\n\t" #else - "BGE.N L_curve25519_bits_%=\n\t" + "BGE.W L_curve25519_bits\n\t" #endif "MOV %[n], #0x1f\n\t" "STR %[n], [sp, #180]\n\t" "SUBS %[a], %[a], #0x4\n\t" "STR %[a], [sp, #176]\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGE L_curve25519_words_%=\n\t" +#ifdef __GNUC__ + "BGE L_curve25519_words\n\t" #else - "BGE.N L_curve25519_words_%=\n\t" + "BGE.W L_curve25519_words\n\t" #endif /* Invert */ "ADD r1, sp, #0x0\n\t" @@ -3022,7 +3022,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x4\n\t" "\n" - "L_curve25519_inv_1_%=:\n\t" + "L_curve25519_inv_1:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3030,9 +3030,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_1_%=\n\t" + "BNE L_curve25519_inv_1\n\t" #else - "BNE.N L_curve25519_inv_1_%=\n\t" + "BNE.N L_curve25519_inv_1\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3043,7 +3043,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x9\n\t" "\n" - "L_curve25519_inv_2_%=:\n\t" + "L_curve25519_inv_2:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3051,9 +3051,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_2_%=\n\t" + "BNE L_curve25519_inv_2\n\t" #else - "BNE.N L_curve25519_inv_2_%=\n\t" + "BNE.N L_curve25519_inv_2\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3064,7 +3064,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x13\n\t" "\n" - "L_curve25519_inv_3_%=:\n\t" + "L_curve25519_inv_3:\n\t" "ADD r1, sp, 
#0x80\n\t" "ADD r0, sp, #0x80\n\t" "PUSH {r12}\n\t" @@ -3072,9 +3072,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_3_%=\n\t" + "BNE L_curve25519_inv_3\n\t" #else - "BNE.N L_curve25519_inv_3_%=\n\t" + "BNE.N L_curve25519_inv_3\n\t" #endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" @@ -3082,7 +3082,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0xa\n\t" "\n" - "L_curve25519_inv_4_%=:\n\t" + "L_curve25519_inv_4:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3090,9 +3090,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_4_%=\n\t" + "BNE L_curve25519_inv_4\n\t" #else - "BNE.N L_curve25519_inv_4_%=\n\t" + "BNE.N L_curve25519_inv_4\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3103,7 +3103,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x31\n\t" "\n" - "L_curve25519_inv_5_%=:\n\t" + "L_curve25519_inv_5:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3111,9 +3111,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_5_%=\n\t" + "BNE L_curve25519_inv_5\n\t" #else - "BNE.N L_curve25519_inv_5_%=\n\t" + "BNE.N L_curve25519_inv_5\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3124,7 +3124,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x63\n\t" "\n" - "L_curve25519_inv_6_%=:\n\t" + "L_curve25519_inv_6:\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x80\n\t" "PUSH {r12}\n\t" @@ -3132,9 +3132,9 @@ int curve25519(byte* r, 
const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_6_%=\n\t" + "BNE L_curve25519_inv_6\n\t" #else - "BNE.N L_curve25519_inv_6_%=\n\t" + "BNE.N L_curve25519_inv_6\n\t" #endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" @@ -3142,7 +3142,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0x32\n\t" "\n" - "L_curve25519_inv_7_%=:\n\t" + "L_curve25519_inv_7:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3150,9 +3150,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_7_%=\n\t" + "BNE L_curve25519_inv_7\n\t" #else - "BNE.N L_curve25519_inv_7_%=\n\t" + "BNE.N L_curve25519_inv_7\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3160,7 +3160,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0x5\n\t" "\n" - "L_curve25519_inv_8_%=:\n\t" + "L_curve25519_inv_8:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3168,9 +3168,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_8_%=\n\t" + "BNE L_curve25519_inv_8\n\t" #else - "BNE.N L_curve25519_inv_8_%=\n\t" + "BNE.N L_curve25519_inv_8\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3234,7 +3234,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "STM r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" "MOV %[a], #0xfe\n\t" "\n" - "L_curve25519_bits_%=:\n\t" + "L_curve25519_bits:\n\t" "STR %[a], [sp, #168]\n\t" "LDR %[n], [sp, #160]\n\t" "AND r4, %[a], #0x1f\n\t" @@ -3320,9 +3320,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "LDR %[a], [sp, 
#168]\n\t" "SUBS %[a], %[a], #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGE L_curve25519_bits_%=\n\t" + "BGE L_curve25519_bits\n\t" #else - "BGE.N L_curve25519_bits_%=\n\t" + "BGE.N L_curve25519_bits\n\t" #endif /* Cycle Count: 171 */ "LDR %[n], [sp, #184]\n\t" @@ -3359,7 +3359,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x4\n\t" "\n" - "L_curve25519_inv_1_%=:\n\t" + "L_curve25519_inv_1:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3367,9 +3367,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_1_%=\n\t" + "BNE L_curve25519_inv_1\n\t" #else - "BNE.N L_curve25519_inv_1_%=\n\t" + "BNE.N L_curve25519_inv_1\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3380,7 +3380,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x9\n\t" "\n" - "L_curve25519_inv_2_%=:\n\t" + "L_curve25519_inv_2:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3388,9 +3388,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_2_%=\n\t" + "BNE L_curve25519_inv_2\n\t" #else - "BNE.N L_curve25519_inv_2_%=\n\t" + "BNE.N L_curve25519_inv_2\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3401,7 +3401,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x13\n\t" "\n" - "L_curve25519_inv_3_%=:\n\t" + "L_curve25519_inv_3:\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x80\n\t" "PUSH {r12}\n\t" @@ -3409,9 +3409,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_3_%=\n\t" + "BNE L_curve25519_inv_3\n\t" #else - "BNE.N L_curve25519_inv_3_%=\n\t" + "BNE.N L_curve25519_inv_3\n\t" #endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" @@ -3419,7 +3419,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0xa\n\t" "\n" - "L_curve25519_inv_4_%=:\n\t" + "L_curve25519_inv_4:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3427,9 +3427,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_4_%=\n\t" + "BNE L_curve25519_inv_4\n\t" #else - "BNE.N L_curve25519_inv_4_%=\n\t" + "BNE.N L_curve25519_inv_4\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3440,7 +3440,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x31\n\t" "\n" - "L_curve25519_inv_5_%=:\n\t" + "L_curve25519_inv_5:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3448,9 +3448,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_5_%=\n\t" + "BNE L_curve25519_inv_5\n\t" #else - "BNE.N L_curve25519_inv_5_%=\n\t" + "BNE.N L_curve25519_inv_5\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3461,7 +3461,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x63\n\t" "\n" - "L_curve25519_inv_6_%=:\n\t" + "L_curve25519_inv_6:\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x80\n\t" "PUSH {r12}\n\t" @@ -3469,9 +3469,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_6_%=\n\t" + "BNE L_curve25519_inv_6\n\t" 
#else - "BNE.N L_curve25519_inv_6_%=\n\t" + "BNE.N L_curve25519_inv_6\n\t" #endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" @@ -3479,7 +3479,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0x32\n\t" "\n" - "L_curve25519_inv_7_%=:\n\t" + "L_curve25519_inv_7:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3487,9 +3487,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_7_%=\n\t" + "BNE L_curve25519_inv_7\n\t" #else - "BNE.N L_curve25519_inv_7_%=\n\t" + "BNE.N L_curve25519_inv_7\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3497,7 +3497,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0x5\n\t" "\n" - "L_curve25519_inv_8_%=:\n\t" + "L_curve25519_inv_8:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3505,9 +3505,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_8_%=\n\t" + "BNE L_curve25519_inv_8\n\t" #else - "BNE.N L_curve25519_inv_8_%=\n\t" + "BNE.N L_curve25519_inv_8\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3589,7 +3589,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x4\n\t" "\n" - "L_fe_invert1_%=:\n\t" + "L_fe_invert1:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3597,9 +3597,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert1_%=\n\t" + "BNE L_fe_invert1\n\t" #else - "BNE.N L_fe_invert1_%=\n\t" + "BNE.N L_fe_invert1\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3610,7 +3610,7 @@ void fe_invert(fe r, 
const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x9\n\t" "\n" - "L_fe_invert2_%=:\n\t" + "L_fe_invert2:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3618,9 +3618,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert2_%=\n\t" + "BNE L_fe_invert2\n\t" #else - "BNE.N L_fe_invert2_%=\n\t" + "BNE.N L_fe_invert2\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3631,7 +3631,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x13\n\t" "\n" - "L_fe_invert3_%=:\n\t" + "L_fe_invert3:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3639,9 +3639,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert3_%=\n\t" + "BNE L_fe_invert3\n\t" #else - "BNE.N L_fe_invert3_%=\n\t" + "BNE.N L_fe_invert3\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3649,7 +3649,7 @@ void fe_invert(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0xa\n\t" "\n" - "L_fe_invert4_%=:\n\t" + "L_fe_invert4:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3657,9 +3657,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert4_%=\n\t" + "BNE L_fe_invert4\n\t" #else - "BNE.N L_fe_invert4_%=\n\t" + "BNE.N L_fe_invert4\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3670,7 +3670,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x31\n\t" "\n" - "L_fe_invert5_%=:\n\t" + "L_fe_invert5:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3678,9 +3678,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert5_%=\n\t" + "BNE L_fe_invert5\n\t" #else - "BNE.N L_fe_invert5_%=\n\t" + "BNE.N L_fe_invert5\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3691,7 +3691,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x63\n\t" "\n" - "L_fe_invert6_%=:\n\t" + "L_fe_invert6:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3699,9 +3699,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert6_%=\n\t" + "BNE L_fe_invert6\n\t" #else - "BNE.N L_fe_invert6_%=\n\t" + "BNE.N L_fe_invert6\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3709,7 +3709,7 @@ void fe_invert(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0x32\n\t" "\n" - "L_fe_invert7_%=:\n\t" + "L_fe_invert7:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3717,9 +3717,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert7_%=\n\t" + "BNE L_fe_invert7\n\t" #else - "BNE.N L_fe_invert7_%=\n\t" + "BNE.N L_fe_invert7\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3727,7 +3727,7 @@ void fe_invert(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0x5\n\t" "\n" - "L_fe_invert8_%=:\n\t" + "L_fe_invert8:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -3735,9 +3735,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert8_%=\n\t" + "BNE L_fe_invert8\n\t" #else - "BNE.N L_fe_invert8_%=\n\t" + "BNE.N L_fe_invert8\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4275,7 +4275,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x4\n\t" "\n" - "L_fe_pow22523_1_%=:\n\t" + 
"L_fe_pow22523_1:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4283,9 +4283,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_1_%=\n\t" + "BNE L_fe_pow22523_1\n\t" #else - "BNE.N L_fe_pow22523_1_%=\n\t" + "BNE.N L_fe_pow22523_1\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4296,7 +4296,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x9\n\t" "\n" - "L_fe_pow22523_2_%=:\n\t" + "L_fe_pow22523_2:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4304,9 +4304,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_2_%=\n\t" + "BNE L_fe_pow22523_2\n\t" #else - "BNE.N L_fe_pow22523_2_%=\n\t" + "BNE.N L_fe_pow22523_2\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4317,7 +4317,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x13\n\t" "\n" - "L_fe_pow22523_3_%=:\n\t" + "L_fe_pow22523_3:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -4325,9 +4325,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_3_%=\n\t" + "BNE L_fe_pow22523_3\n\t" #else - "BNE.N L_fe_pow22523_3_%=\n\t" + "BNE.N L_fe_pow22523_3\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -4335,7 +4335,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0xa\n\t" "\n" - "L_fe_pow22523_4_%=:\n\t" + "L_fe_pow22523_4:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4343,9 +4343,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE 
L_fe_pow22523_4_%=\n\t" + "BNE L_fe_pow22523_4\n\t" #else - "BNE.N L_fe_pow22523_4_%=\n\t" + "BNE.N L_fe_pow22523_4\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4356,7 +4356,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x31\n\t" "\n" - "L_fe_pow22523_5_%=:\n\t" + "L_fe_pow22523_5:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4364,9 +4364,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_5_%=\n\t" + "BNE L_fe_pow22523_5\n\t" #else - "BNE.N L_fe_pow22523_5_%=\n\t" + "BNE.N L_fe_pow22523_5\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4377,7 +4377,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x63\n\t" "\n" - "L_fe_pow22523_6_%=:\n\t" + "L_fe_pow22523_6:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -4385,9 +4385,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_6_%=\n\t" + "BNE L_fe_pow22523_6\n\t" #else - "BNE.N L_fe_pow22523_6_%=\n\t" + "BNE.N L_fe_pow22523_6\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -4395,7 +4395,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0x32\n\t" "\n" - "L_fe_pow22523_7_%=:\n\t" + "L_fe_pow22523_7:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4403,9 +4403,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_7_%=\n\t" + "BNE L_fe_pow22523_7\n\t" #else - "BNE.N L_fe_pow22523_7_%=\n\t" + "BNE.N L_fe_pow22523_7\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4413,7 +4413,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0x2\n\t" "\n" - 
"L_fe_pow22523_8_%=:\n\t" + "L_fe_pow22523_8:\n\t" "MOV r1, sp\n\t" "MOV r0, sp\n\t" "PUSH {r12}\n\t" @@ -4421,9 +4421,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_8_%=\n\t" + "BNE L_fe_pow22523_8\n\t" #else - "BNE.N L_fe_pow22523_8_%=\n\t" + "BNE.N L_fe_pow22523_8\n\t" #endif "LDR r2, [sp, #100]\n\t" "MOV r1, sp\n\t" @@ -6904,7 +6904,7 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c) #endif /* HAVE_CURVE25519 || HAVE_ED25519 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ -#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* !defined(__aarch64__) && defined(__thumb__) */ #endif /* WOLFSSL_ARMASM */ #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S index 7c59e25482..30d8dc76b5 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S @@ -113,7 +113,7 @@ Transform_Sha256_Len: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0xc0 ADR r3, L_SHA256_transform_len_k - # Copy digest to add in at end + /* Copy digest to add in at end */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] LDRD r8, r9, [r0, #16] @@ -122,9 +122,9 @@ Transform_Sha256_Len: STRD r6, r7, [sp, #72] STRD r8, r9, [sp, #80] STRD r10, r11, [sp, #88] - # Start of loop processing a block + /* Start of loop processing a block */ L_SHA256_transform_len_begin: - # Load, Reverse and Store W - 64 bytes + /* Load, Reverse and Store W - 64 bytes */ LDR r4, [r1] LDR r5, [r1, #4] LDR r6, [r1, #8] @@ -169,9 +169,9 @@ L_SHA256_transform_len_begin: LDR r4, [r0, #8] EOR r11, r11, r4 MOV r12, #0x3 - # Start of 16 rounds + /* Start of 16 rounds */ L_SHA256_transform_len_start: - # Round 0 + /* Round 0 */ LDR r5, [r0, #16] LDR r6, [r0, #20] LDR r7, [r0, #24] @@ -203,7 +203,7 @@ 
L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #12] STR r9, [r0, #28] - # Calc new W[0] + /* Calc new W[0] */ LDR r6, [sp, #56] LDR r7, [sp, #36] LDR r8, [sp, #4] @@ -218,7 +218,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp] - # Round 1 + /* Round 1 */ LDR r5, [r0, #12] LDR r6, [r0, #16] LDR r7, [r0, #20] @@ -250,7 +250,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #8] STR r9, [r0, #24] - # Calc new W[1] + /* Calc new W[1] */ LDR r6, [sp, #60] LDR r7, [sp, #40] LDR r8, [sp, #8] @@ -265,7 +265,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #4] - # Round 2 + /* Round 2 */ LDR r5, [r0, #8] LDR r6, [r0, #12] LDR r7, [r0, #16] @@ -297,7 +297,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #4] STR r9, [r0, #20] - # Calc new W[2] + /* Calc new W[2] */ LDR r6, [sp] LDR r7, [sp, #44] LDR r8, [sp, #12] @@ -312,7 +312,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #8] - # Round 3 + /* Round 3 */ LDR r5, [r0, #4] LDR r6, [r0, #8] LDR r7, [r0, #12] @@ -344,7 +344,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0] STR r9, [r0, #16] - # Calc new W[3] + /* Calc new W[3] */ LDR r6, [sp, #4] LDR r7, [sp, #48] LDR r8, [sp, #16] @@ -359,7 +359,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #12] - # Round 4 + /* Round 4 */ LDR r5, [r0] LDR r6, [r0, #4] LDR r7, [r0, #8] @@ -391,7 +391,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #28] STR r9, [r0, #12] - # Calc new W[4] + /* Calc new W[4] */ LDR r6, [sp, #8] LDR r7, [sp, #52] LDR r8, [sp, #20] @@ -406,7 +406,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #16] - # Round 5 + /* Round 5 */ LDR r5, [r0, #28] LDR r6, [r0] LDR r7, [r0, #4] @@ -438,7 +438,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #24] STR r9, [r0, #8] - # Calc new W[5] + /* Calc new W[5] */ LDR r6, [sp, #12] LDR r7, [sp, #56] 
LDR r8, [sp, #24] @@ -453,7 +453,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #20] - # Round 6 + /* Round 6 */ LDR r5, [r0, #24] LDR r6, [r0, #28] LDR r7, [r0] @@ -485,7 +485,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #20] STR r9, [r0, #4] - # Calc new W[6] + /* Calc new W[6] */ LDR r6, [sp, #16] LDR r7, [sp, #60] LDR r8, [sp, #28] @@ -500,7 +500,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #24] - # Round 7 + /* Round 7 */ LDR r5, [r0, #20] LDR r6, [r0, #24] LDR r7, [r0, #28] @@ -532,7 +532,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #16] STR r9, [r0] - # Calc new W[7] + /* Calc new W[7] */ LDR r6, [sp, #20] LDR r7, [sp] LDR r8, [sp, #32] @@ -547,7 +547,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #28] - # Round 8 + /* Round 8 */ LDR r5, [r0, #16] LDR r6, [r0, #20] LDR r7, [r0, #24] @@ -579,7 +579,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #12] STR r9, [r0, #28] - # Calc new W[8] + /* Calc new W[8] */ LDR r6, [sp, #24] LDR r7, [sp, #4] LDR r8, [sp, #36] @@ -594,7 +594,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #32] - # Round 9 + /* Round 9 */ LDR r5, [r0, #12] LDR r6, [r0, #16] LDR r7, [r0, #20] @@ -626,7 +626,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #8] STR r9, [r0, #24] - # Calc new W[9] + /* Calc new W[9] */ LDR r6, [sp, #28] LDR r7, [sp, #8] LDR r8, [sp, #40] @@ -641,7 +641,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #36] - # Round 10 + /* Round 10 */ LDR r5, [r0, #8] LDR r6, [r0, #12] LDR r7, [r0, #16] @@ -673,7 +673,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #4] STR r9, [r0, #20] - # Calc new W[10] + /* Calc new W[10] */ LDR r6, [sp, #32] LDR r7, [sp, #12] LDR r8, [sp, #44] @@ -688,7 +688,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #40] - # Round 11 + /* 
Round 11 */ LDR r5, [r0, #4] LDR r6, [r0, #8] LDR r7, [r0, #12] @@ -720,7 +720,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0] STR r9, [r0, #16] - # Calc new W[11] + /* Calc new W[11] */ LDR r6, [sp, #36] LDR r7, [sp, #16] LDR r8, [sp, #48] @@ -735,7 +735,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #44] - # Round 12 + /* Round 12 */ LDR r5, [r0] LDR r6, [r0, #4] LDR r7, [r0, #8] @@ -767,7 +767,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #28] STR r9, [r0, #12] - # Calc new W[12] + /* Calc new W[12] */ LDR r6, [sp, #40] LDR r7, [sp, #20] LDR r8, [sp, #52] @@ -782,7 +782,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #48] - # Round 13 + /* Round 13 */ LDR r5, [r0, #28] LDR r6, [r0] LDR r7, [r0, #4] @@ -814,7 +814,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #24] STR r9, [r0, #8] - # Calc new W[13] + /* Calc new W[13] */ LDR r6, [sp, #44] LDR r7, [sp, #24] LDR r8, [sp, #56] @@ -829,7 +829,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #52] - # Round 14 + /* Round 14 */ LDR r5, [r0, #24] LDR r6, [r0, #28] LDR r7, [r0] @@ -861,7 +861,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #20] STR r9, [r0, #4] - # Calc new W[14] + /* Calc new W[14] */ LDR r6, [sp, #48] LDR r7, [sp, #28] LDR r8, [sp, #60] @@ -876,7 +876,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #56] - # Round 15 + /* Round 15 */ LDR r5, [r0, #20] LDR r6, [r0, #24] LDR r7, [r0, #28] @@ -908,7 +908,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #16] STR r9, [r0] - # Calc new W[15] + /* Calc new W[15] */ LDR r6, [sp, #52] LDR r7, [sp, #32] LDR r8, [sp] @@ -925,12 +925,12 @@ L_SHA256_transform_len_start: STR r9, [sp, #60] ADD r3, r3, #0x40 SUBS r12, r12, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA256_transform_len_start #else - BNE.N 
L_SHA256_transform_len_start + BNE.W L_SHA256_transform_len_start #endif - # Round 0 + /* Round 0 */ LDR r5, [r0, #16] LDR r6, [r0, #20] LDR r7, [r0, #24] @@ -962,7 +962,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #12] STR r9, [r0, #28] - # Round 1 + /* Round 1 */ LDR r5, [r0, #12] LDR r6, [r0, #16] LDR r7, [r0, #20] @@ -994,7 +994,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #8] STR r9, [r0, #24] - # Round 2 + /* Round 2 */ LDR r5, [r0, #8] LDR r6, [r0, #12] LDR r7, [r0, #16] @@ -1026,7 +1026,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #4] STR r9, [r0, #20] - # Round 3 + /* Round 3 */ LDR r5, [r0, #4] LDR r6, [r0, #8] LDR r7, [r0, #12] @@ -1058,7 +1058,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0] STR r9, [r0, #16] - # Round 4 + /* Round 4 */ LDR r5, [r0] LDR r6, [r0, #4] LDR r7, [r0, #8] @@ -1090,7 +1090,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #28] STR r9, [r0, #12] - # Round 5 + /* Round 5 */ LDR r5, [r0, #28] LDR r6, [r0] LDR r7, [r0, #4] @@ -1122,7 +1122,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #24] STR r9, [r0, #8] - # Round 6 + /* Round 6 */ LDR r5, [r0, #24] LDR r6, [r0, #28] LDR r7, [r0] @@ -1154,7 +1154,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #20] STR r9, [r0, #4] - # Round 7 + /* Round 7 */ LDR r5, [r0, #20] LDR r6, [r0, #24] LDR r7, [r0, #28] @@ -1186,7 +1186,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #16] STR r9, [r0] - # Round 8 + /* Round 8 */ LDR r5, [r0, #16] LDR r6, [r0, #20] LDR r7, [r0, #24] @@ -1218,7 +1218,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #12] STR r9, [r0, #28] - # Round 9 + /* Round 9 */ LDR r5, [r0, #12] LDR r6, [r0, #16] LDR r7, [r0, #20] @@ -1250,7 +1250,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #8] STR r9, [r0, #24] - # Round 10 + /* Round 10 */ LDR r5, [r0, #8] LDR r6, [r0, #12] LDR r7, [r0, #16] @@ -1282,7 +1282,7 @@ 
L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #4] STR r9, [r0, #20] - # Round 11 + /* Round 11 */ LDR r5, [r0, #4] LDR r6, [r0, #8] LDR r7, [r0, #12] @@ -1314,7 +1314,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0] STR r9, [r0, #16] - # Round 12 + /* Round 12 */ LDR r5, [r0] LDR r6, [r0, #4] LDR r7, [r0, #8] @@ -1346,7 +1346,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #28] STR r9, [r0, #12] - # Round 13 + /* Round 13 */ LDR r5, [r0, #28] LDR r6, [r0] LDR r7, [r0, #4] @@ -1378,7 +1378,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #24] STR r9, [r0, #8] - # Round 14 + /* Round 14 */ LDR r5, [r0, #24] LDR r6, [r0, #28] LDR r7, [r0] @@ -1410,7 +1410,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #20] STR r9, [r0, #4] - # Round 15 + /* Round 15 */ LDR r5, [r0, #20] LDR r6, [r0, #24] LDR r7, [r0, #28] @@ -1442,7 +1442,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #16] STR r9, [r0] - # Add in digest from start + /* Add in digest from start */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] LDRD r8, r9, [sp, #64] @@ -1470,14 +1470,14 @@ L_SHA256_transform_len_start: SUBS r2, r2, #0x40 SUB r3, r3, #0xc0 ADD r1, r1, #0x40 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA256_transform_len_begin #else - BNE.N L_SHA256_transform_len_begin + BNE.W L_SHA256_transform_len_begin #endif ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 1874 + /* Cycle Count = 1874 */ .size Transform_Sha256_Len,.-Transform_Sha256_Len #endif /* WOLFSSL_ARMASM_NO_NEON */ #endif /* !NO_SHA256 */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c index 2483f036d5..a2367c2a2c 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c @@ -39,7 +39,7 @@ #ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_ARMASM -#if !defined(__aarch64__) 
&& defined(__arm__) +#if !defined(__aarch64__) && defined(__thumb__) #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm @@ -84,8 +84,8 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) register wc_Sha256* sha256 __asm__ ("r0") = (wc_Sha256*)sha256_p; register const byte* data __asm__ ("r1") = (const byte*)data_p; register word32 len __asm__ ("r2") = (word32)len_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_SHA256_transform_len_k_c __asm__ ("r3") = (uint32_t*)&L_SHA256_transform_len_k; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "SUB sp, sp, #0xc0\n\t" @@ -101,7 +101,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "STRD r10, r11, [sp, #88]\n\t" /* Start of loop processing a block */ "\n" - "L_SHA256_transform_len_begin_%=:\n\t" + "L_SHA256_transform_len_begin:\n\t" /* Load, Reverse and Store W - 64 bytes */ "LDR r4, [%[data]]\n\t" "LDR r5, [%[data], #4]\n\t" @@ -149,7 +149,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "MOV r12, #0x3\n\t" /* Start of 16 rounds */ "\n" - "L_SHA256_transform_len_start_%=:\n\t" + "L_SHA256_transform_len_start:\n\t" /* Round 0 */ "LDR r5, [%[sha256], #16]\n\t" "LDR r6, [%[sha256], #20]\n\t" @@ -904,10 +904,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "STR r9, [sp, #60]\n\t" "ADD r3, r3, #0x40\n\t" "SUBS r12, r12, #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_SHA256_transform_len_start_%=\n\t" +#ifdef __GNUC__ + "BNE L_SHA256_transform_len_start\n\t" #else - "BNE.N L_SHA256_transform_len_start_%=\n\t" + "BNE.W L_SHA256_transform_len_start\n\t" #endif /* Round 0 */ "LDR r5, [%[sha256], #16]\n\t" @@ -1449,14 +1449,20 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "SUBS %[len], %[len], #0x40\n\t" "SUB r3, r3, #0xc0\n\t" "ADD %[data], %[data], #0x40\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) 
|| defined(__IAR_SYSTEMS_ICC__) - "BNE L_SHA256_transform_len_begin_%=\n\t" +#ifdef __GNUC__ + "BNE L_SHA256_transform_len_begin\n\t" #else - "BNE.N L_SHA256_transform_len_begin_%=\n\t" + "BNE.W L_SHA256_transform_len_begin\n\t" #endif "ADD sp, sp, #0xc0\n\t" - : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), + [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) : +#else + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) + : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } @@ -1465,7 +1471,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) #endif /* !NO_SHA256 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ -#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* !defined(__aarch64__) && defined(__thumb__) */ #endif /* WOLFSSL_ARMASM */ #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S index b420e78634..6031b92404 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S @@ -209,7 +209,7 @@ Transform_Sha512_Len: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0xc0 ADR r3, L_SHA512_transform_len_k - # Copy digest to add in at end + /* Copy digest to add in at end */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] LDRD r8, r9, [r0, #16] @@ -226,9 +226,9 @@ Transform_Sha512_Len: STRD r6, r7, [sp, #168] STRD r8, r9, [sp, #176] STRD r10, r11, [sp, #184] - # Start of loop processing a block + /* Start of loop processing a block */ L_SHA512_transform_len_begin: - # Load, Reverse and Store W + /* Load, Reverse and Store W */ LDR r4, [r1] LDR r5, [r1, #4] LDR r6, 
[r1, #8] @@ -325,15 +325,15 @@ L_SHA512_transform_len_begin: STR r8, [sp, #116] STR r11, [sp, #120] STR r10, [sp, #124] - # Pre-calc: b ^ c + /* Pre-calc: b ^ c */ LDRD r10, r11, [r0, #8] LDRD r4, r5, [r0, #16] EOR r10, r10, r4 EOR r11, r11, r5 MOV r12, #0x4 - # Start of 16 rounds + /* Start of 16 rounds */ L_SHA512_transform_len_start: - # Round 0 + /* Round 0 */ LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -413,7 +413,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #56] MOV r10, r8 MOV r11, r9 - # Calc new W[0] + /* Calc new W[0] */ LDRD r4, r5, [sp, #112] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -457,7 +457,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp] - # Round 1 + /* Round 1 */ LDRD r4, r5, [r0, #24] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -537,7 +537,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #48] MOV r10, r8 MOV r11, r9 - # Calc new W[1] + /* Calc new W[1] */ LDRD r4, r5, [sp, #120] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -581,7 +581,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #8] - # Round 2 + /* Round 2 */ LDRD r4, r5, [r0, #16] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -661,7 +661,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #40] MOV r10, r8 MOV r11, r9 - # Calc new W[2] + /* Calc new W[2] */ LDRD r4, r5, [sp] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -705,7 +705,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #16] - # Round 3 + /* Round 3 */ LDRD r4, r5, [r0, #8] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -785,7 +785,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #32] MOV r10, r8 MOV r11, r9 - # Calc new W[3] + /* Calc new W[3] */ LDRD r4, r5, [sp, #8] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -829,7 +829,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #24] - # Round 4 + /* Round 4 */ LDRD r4, r5, [r0] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -909,7 +909,7 @@ L_SHA512_transform_len_start: STRD r6, 
r7, [r0, #24] MOV r10, r8 MOV r11, r9 - # Calc new W[4] + /* Calc new W[4] */ LDRD r4, r5, [sp, #16] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -953,7 +953,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #32] - # Round 5 + /* Round 5 */ LDRD r4, r5, [r0, #56] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1033,7 +1033,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #16] MOV r10, r8 MOV r11, r9 - # Calc new W[5] + /* Calc new W[5] */ LDRD r4, r5, [sp, #24] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1077,7 +1077,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #40] - # Round 6 + /* Round 6 */ LDRD r4, r5, [r0, #48] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1157,7 +1157,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #8] MOV r10, r8 MOV r11, r9 - # Calc new W[6] + /* Calc new W[6] */ LDRD r4, r5, [sp, #32] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1201,7 +1201,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #48] - # Round 7 + /* Round 7 */ LDRD r4, r5, [r0, #40] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1281,7 +1281,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0] MOV r10, r8 MOV r11, r9 - # Calc new W[7] + /* Calc new W[7] */ LDRD r4, r5, [sp, #40] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1325,7 +1325,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #56] - # Round 8 + /* Round 8 */ LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1405,7 +1405,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #56] MOV r10, r8 MOV r11, r9 - # Calc new W[8] + /* Calc new W[8] */ LDRD r4, r5, [sp, #48] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1449,7 +1449,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #64] - # Round 9 + /* Round 9 */ LDRD r4, r5, [r0, #24] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1529,7 +1529,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #48] MOV r10, r8 MOV r11, r9 - # Calc new W[9] + /* Calc new 
W[9] */ LDRD r4, r5, [sp, #56] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1573,7 +1573,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #72] - # Round 10 + /* Round 10 */ LDRD r4, r5, [r0, #16] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1653,7 +1653,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #40] MOV r10, r8 MOV r11, r9 - # Calc new W[10] + /* Calc new W[10] */ LDRD r4, r5, [sp, #64] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1697,7 +1697,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #80] - # Round 11 + /* Round 11 */ LDRD r4, r5, [r0, #8] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1777,7 +1777,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #32] MOV r10, r8 MOV r11, r9 - # Calc new W[11] + /* Calc new W[11] */ LDRD r4, r5, [sp, #72] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1821,7 +1821,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #88] - # Round 12 + /* Round 12 */ LDRD r4, r5, [r0] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1901,7 +1901,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #24] MOV r10, r8 MOV r11, r9 - # Calc new W[12] + /* Calc new W[12] */ LDRD r4, r5, [sp, #80] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1945,7 +1945,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #96] - # Round 13 + /* Round 13 */ LDRD r4, r5, [r0, #56] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2025,7 +2025,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #16] MOV r10, r8 MOV r11, r9 - # Calc new W[13] + /* Calc new W[13] */ LDRD r4, r5, [sp, #88] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -2069,7 +2069,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #104] - # Round 14 + /* Round 14 */ LDRD r4, r5, [r0, #48] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2149,7 +2149,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #8] MOV r10, r8 MOV r11, r9 - # Calc new W[14] + /* Calc new W[14] */ LDRD r4, r5, [sp, #96] LSRS r6, r4, #19 
LSRS r7, r5, #19 @@ -2193,7 +2193,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #112] - # Round 15 + /* Round 15 */ LDRD r4, r5, [r0, #40] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2273,7 +2273,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0] MOV r10, r8 MOV r11, r9 - # Calc new W[15] + /* Calc new W[15] */ LDRD r4, r5, [sp, #104] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -2319,12 +2319,12 @@ L_SHA512_transform_len_start: STRD r4, r5, [sp, #120] ADD r3, r3, #0x80 SUBS r12, r12, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA512_transform_len_start #else - BNE.N L_SHA512_transform_len_start + BNE.W L_SHA512_transform_len_start #endif - # Round 0 + /* Round 0 */ LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2404,7 +2404,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #56] MOV r10, r8 MOV r11, r9 - # Round 1 + /* Round 1 */ LDRD r4, r5, [r0, #24] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2484,7 +2484,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #48] MOV r10, r8 MOV r11, r9 - # Round 2 + /* Round 2 */ LDRD r4, r5, [r0, #16] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2564,7 +2564,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #40] MOV r10, r8 MOV r11, r9 - # Round 3 + /* Round 3 */ LDRD r4, r5, [r0, #8] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2644,7 +2644,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #32] MOV r10, r8 MOV r11, r9 - # Round 4 + /* Round 4 */ LDRD r4, r5, [r0] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2724,7 +2724,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #24] MOV r10, r8 MOV r11, r9 - # Round 5 + /* Round 5 */ LDRD r4, r5, [r0, #56] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2804,7 +2804,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #16] MOV r10, r8 MOV r11, r9 - # Round 6 + /* Round 6 */ LDRD r4, r5, [r0, #48] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2884,7 +2884,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #8] MOV 
r10, r8 MOV r11, r9 - # Round 7 + /* Round 7 */ LDRD r4, r5, [r0, #40] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2964,7 +2964,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0] MOV r10, r8 MOV r11, r9 - # Round 8 + /* Round 8 */ LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3044,7 +3044,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #56] MOV r10, r8 MOV r11, r9 - # Round 9 + /* Round 9 */ LDRD r4, r5, [r0, #24] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3124,7 +3124,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #48] MOV r10, r8 MOV r11, r9 - # Round 10 + /* Round 10 */ LDRD r4, r5, [r0, #16] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3204,7 +3204,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #40] MOV r10, r8 MOV r11, r9 - # Round 11 + /* Round 11 */ LDRD r4, r5, [r0, #8] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3284,7 +3284,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #32] MOV r10, r8 MOV r11, r9 - # Round 12 + /* Round 12 */ LDRD r4, r5, [r0] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3364,7 +3364,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #24] MOV r10, r8 MOV r11, r9 - # Round 13 + /* Round 13 */ LDRD r4, r5, [r0, #56] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3444,7 +3444,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #16] MOV r10, r8 MOV r11, r9 - # Round 14 + /* Round 14 */ LDRD r4, r5, [r0, #48] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3524,7 +3524,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #8] MOV r10, r8 MOV r11, r9 - # Round 15 + /* Round 15 */ LDRD r4, r5, [r0, #40] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3604,7 +3604,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0] MOV r10, r8 MOV r11, r9 - # Add in digest from start + /* Add in digest from start */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] LDRD r8, r9, [sp, #128] @@ -3656,15 +3656,15 @@ L_SHA512_transform_len_start: SUBS r2, r2, #0x80 SUB r3, r3, #0x200 ADD r1, r1, #0x80 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef 
__GNUC__ BNE L_SHA512_transform_len_begin #else - BNE.N L_SHA512_transform_len_begin + BNE.W L_SHA512_transform_len_begin #endif EOR r0, r0, r0 ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 5021 + /* Cycle Count = 5021 */ .size Transform_Sha512_Len,.-Transform_Sha512_Len #endif /* WOLFSSL_ARMASM_NO_NEON */ #endif /* WOLFSSL_SHA512 */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c index 3dc2d1f207..7521b35fa7 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c @@ -39,7 +39,7 @@ #ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_ARMASM -#if !defined(__aarch64__) && defined(__arm__) +#if !defined(__aarch64__) && defined(__thumb__) #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm @@ -108,8 +108,8 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) register wc_Sha512* sha512 __asm__ ("r0") = (wc_Sha512*)sha512_p; register const byte* data __asm__ ("r1") = (const byte*)data_p; register word32 len __asm__ ("r2") = (word32)len_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint64_t* L_SHA512_transform_len_k_c __asm__ ("r3") = (uint64_t*)&L_SHA512_transform_len_k; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "SUB sp, sp, #0xc0\n\t" @@ -133,7 +133,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "STRD r10, r11, [sp, #184]\n\t" /* Start of loop processing a block */ "\n" - "L_SHA512_transform_len_begin_%=:\n\t" + "L_SHA512_transform_len_begin:\n\t" /* Load, Reverse and Store W */ "LDR r4, [%[data]]\n\t" "LDR r5, [%[data], #4]\n\t" @@ -239,7 +239,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "MOV r12, #0x4\n\t" /* Start of 16 rounds */ "\n" - "L_SHA512_transform_len_start_%=:\n\t" + "L_SHA512_transform_len_start:\n\t" /* Round 0 */ "LDRD r4, r5, [%[sha512], #32]\n\t" "LSRS r6, r4, #14\n\t" @@ -2226,10 +2226,10 @@ void 
Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "STRD r4, r5, [sp, #120]\n\t" "ADD r3, r3, #0x80\n\t" "SUBS r12, r12, #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_SHA512_transform_len_start_%=\n\t" +#ifdef __GNUC__ + "BNE L_SHA512_transform_len_start\n\t" #else - "BNE.N L_SHA512_transform_len_start_%=\n\t" + "BNE.W L_SHA512_transform_len_start\n\t" #endif /* Round 0 */ "LDRD r4, r5, [%[sha512], #32]\n\t" @@ -3563,15 +3563,21 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "SUBS %[len], %[len], #0x80\n\t" "SUB r3, r3, #0x200\n\t" "ADD %[data], %[data], #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_SHA512_transform_len_begin_%=\n\t" +#ifdef __GNUC__ + "BNE L_SHA512_transform_len_begin\n\t" #else - "BNE.N L_SHA512_transform_len_begin_%=\n\t" + "BNE.W L_SHA512_transform_len_begin\n\t" #endif "EOR r0, r0, r0\n\t" "ADD sp, sp, #0xc0\n\t" - : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), + [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) : +#else + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } @@ -3580,7 +3586,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) #endif /* WOLFSSL_SHA512 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ -#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* !defined(__aarch64__) && defined(__thumb__) */ #endif /* WOLFSSL_ARMASM */ #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/caam/wolfcaam_seco.c 
b/wolfcrypt/src/port/caam/wolfcaam_seco.c index dbe6db987e..8326f308f2 100644 --- a/wolfcrypt/src/port/caam/wolfcaam_seco.c +++ b/wolfcrypt/src/port/caam/wolfcaam_seco.c @@ -1228,7 +1228,7 @@ word32 wc_SECO_WrapKey(word32 keyId, byte* in, word32 inSz, byte* iv, } -/* trasnlates the HSM error to wolfSSL error and does debug print out */ +/* Translates the HSM error to wolfSSL error and does debug print out */ int wc_TranslateHSMError(int current, hsm_err_t err) { int ret = -1; diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index a1ae275de6..c1c0fb8a06 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -5403,10 +5404,13 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x200\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_mul_64_outer_%=: \n\t" "subs r3, r5, #0xfc\n\t" @@ -5451,13 +5455,86 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, 
r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x100\n\t" - "beq L_sp_2048_mul_64_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_mul_64_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_mul_64_inner_done_%=\n\t" + "blt L_sp_2048_mul_64_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_2048_mul_64_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -5465,14 +5542,46 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x1f8\n\t" + "cmp r5, #0x1f4\n\t" "ble L_sp_2048_mul_64_outer_%=\n\t" + "ldr lr, [%[a], #252]\n\t" + "ldr r11, [%[b], #252]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, 
#16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_mul_64_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_mul_64_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -5492,10 +5601,12 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x200\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_sqr_64_outer_%=: \n\t" "subs r3, r5, #0xfc\n\t" @@ -5504,8 +5615,6 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_64_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_2048_sqr_64_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -5557,9 +5666,11 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_2048_sqr_64_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_64_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt 
L_sp_2048_sqr_64_inner_done_%=\n\t" + "blt L_sp_2048_sqr_64_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -5588,30 +5699,46 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_2048_sqr_64_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x100\n\t" - "beq L_sp_2048_sqr_64_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_2048_sqr_64_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_sqr_64_inner_%=\n\t" - "\n" "L_sp_2048_sqr_64_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x1f8\n\t" + "cmp r5, #0x1f4\n\t" "ble L_sp_2048_sqr_64_outer_%=\n\t" + "ldr lr, [%[a], #252]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_sqr_64_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_sqr_64_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -5728,10 +5855,13 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr 
r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_mul_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -5776,13 +5906,86 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_2048_mul_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_mul_32_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_mul_32_inner_done_%=\n\t" + "blt L_sp_2048_mul_32_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, 
r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_2048_mul_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -5790,14 +5993,46 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_2048_mul_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" + "ldr r11, [%[b], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_mul_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_mul_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -5817,10 +6052,12 @@ static void 
sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_sqr_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -5829,8 +6066,6 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_32_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_2048_sqr_32_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -5882,9 +6117,11 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_2048_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_32_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_sqr_32_inner_done_%=\n\t" + "blt L_sp_2048_sqr_32_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -5913,30 +6150,46 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_2048_sqr_32_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_2048_sqr_32_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_2048_sqr_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_sqr_32_inner_%=\n\t" - "\n" "L_sp_2048_sqr_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_2048_sqr_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc 
r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_sqr_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_sqr_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -28088,10 +28341,13 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x300\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_mul_96_outer_%=: \n\t" "subs r3, r5, #0x17c\n\t" @@ -28136,13 +28392,86 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, 
#16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x180\n\t" - "beq L_sp_3072_mul_96_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_mul_96_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_mul_96_inner_done_%=\n\t" + "blt L_sp_3072_mul_96_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_3072_mul_96_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -28150,14 +28479,46 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x2f8\n\t" + "cmp r5, #0x2f4\n\t" "ble L_sp_3072_mul_96_outer_%=\n\t" + "ldr lr, [%[a], #380]\n\t" + "ldr r11, [%[b], #380]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, 
#0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_mul_96_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_mul_96_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -28177,10 +28538,12 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x300\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_sqr_96_outer_%=: \n\t" "subs r3, r5, #0x17c\n\t" @@ -28189,8 +28552,6 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_96_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_3072_sqr_96_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -28242,9 +28603,11 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_3072_sqr_96_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_96_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_sqr_96_inner_done_%=\n\t" + "blt L_sp_3072_sqr_96_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -28273,30 +28636,46 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_3072_sqr_96_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x180\n\t" - "beq L_sp_3072_sqr_96_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_3072_sqr_96_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_sqr_96_inner_%=\n\t" - "\n" "L_sp_3072_sqr_96_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x2f8\n\t" + "cmp r5, #0x2f4\n\t" "ble L_sp_3072_sqr_96_outer_%=\n\t" + "ldr lr, [%[a], #380]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_sqr_96_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_sqr_96_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -28413,10 +28792,13 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x180\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" 
"L_sp_3072_mul_48_outer_%=: \n\t" "subs r3, r5, #0xbc\n\t" @@ -28461,13 +28843,86 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0xc0\n\t" - "beq L_sp_3072_mul_48_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_mul_48_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_mul_48_inner_done_%=\n\t" + "blt L_sp_3072_mul_48_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, 
r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_3072_mul_48_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -28475,14 +28930,46 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x178\n\t" + "cmp r5, #0x174\n\t" "ble L_sp_3072_mul_48_outer_%=\n\t" + "ldr lr, [%[a], #188]\n\t" + "ldr r11, [%[b], #188]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_mul_48_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_mul_48_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -28502,10 +28989,12 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x180\n\t" - "mov r6, #0\n\t" + "ldr lr, 
[%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_sqr_48_outer_%=: \n\t" "subs r3, r5, #0xbc\n\t" @@ -28514,8 +29003,6 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_48_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_3072_sqr_48_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -28567,9 +29054,11 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_3072_sqr_48_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_48_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_sqr_48_inner_done_%=\n\t" + "blt L_sp_3072_sqr_48_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -28598,30 +29087,46 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_3072_sqr_48_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0xc0\n\t" - "beq L_sp_3072_sqr_48_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_3072_sqr_48_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_sqr_48_inner_%=\n\t" - "\n" "L_sp_3072_sqr_48_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x178\n\t" + "cmp r5, #0x174\n\t" "ble L_sp_3072_sqr_48_outer_%=\n\t" + "ldr lr, [%[a], #188]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr 
r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_sqr_48_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_sqr_48_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -46058,10 +46563,13 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* __asm__ __volatile__ ( "sub sp, sp, #0x400\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_4096_mul_128_outer_%=: \n\t" "subs r3, r5, #0x1fc\n\t" @@ -46106,13 +46614,86 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, 
lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x200\n\t" - "beq L_sp_4096_mul_128_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_4096_mul_128_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_4096_mul_128_inner_done_%=\n\t" + "blt L_sp_4096_mul_128_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_4096_mul_128_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -46120,14 +46701,46 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x3f8\n\t" + "cmp r5, #0x3f4\n\t" "ble L_sp_4096_mul_128_outer_%=\n\t" + "ldr lr, [%[a], #508]\n\t" + "ldr r11, [%[b], #508]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, 
r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_4096_mul_128_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_4096_mul_128_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -46147,10 +46760,12 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x400\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_4096_sqr_128_outer_%=: \n\t" "subs r3, r5, #0x1fc\n\t" @@ -46159,8 +46774,6 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_4096_sqr_128_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_4096_sqr_128_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -46212,9 +46825,11 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_4096_sqr_128_op_done_%=\n\t" - "\n" - "L_sp_4096_sqr_128_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_4096_sqr_128_inner_done_%=\n\t" + "blt L_sp_4096_sqr_128_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -46243,30 +46858,46 @@ static void sp_4096_sqr_128(sp_digit* r_p, 
const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_4096_sqr_128_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x200\n\t" - "beq L_sp_4096_sqr_128_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_4096_sqr_128_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_4096_sqr_128_inner_%=\n\t" - "\n" "L_sp_4096_sqr_128_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x3f8\n\t" + "cmp r5, #0x3f4\n\t" "ble L_sp_4096_sqr_128_outer_%=\n\t" + "ldr lr, [%[a], #508]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_4096_sqr_128_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_4096_sqr_128_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -60831,10 +61462,13 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_256_mul_8_outer_%=: \n\t" "subs r3, r5, #28\n\t" @@ -60879,13 +61513,86 @@ static void 
sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #32\n\t" - "beq L_sp_256_mul_8_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_256_mul_8_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_256_mul_8_inner_done_%=\n\t" + "blt L_sp_256_mul_8_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr 
r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_256_mul_8_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -60893,14 +61600,46 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #56\n\t" + "cmp r5, #52\n\t" "ble L_sp_256_mul_8_outer_%=\n\t" + "ldr lr, [%[a], #28]\n\t" + "ldr r11, [%[b], #28]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_256_mul_8_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_256_mul_8_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -63403,10 +64142,12 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" 
"L_sp_256_sqr_8_outer_%=: \n\t" "subs r3, r5, #28\n\t" @@ -63415,8 +64156,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_256_sqr_8_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_256_sqr_8_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -63468,9 +64207,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_256_sqr_8_op_done_%=\n\t" - "\n" - "L_sp_256_sqr_8_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_256_sqr_8_inner_done_%=\n\t" + "blt L_sp_256_sqr_8_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -63499,30 +64240,46 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_256_sqr_8_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #32\n\t" - "beq L_sp_256_sqr_8_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_256_sqr_8_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_256_sqr_8_inner_%=\n\t" - "\n" "L_sp_256_sqr_8_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #56\n\t" + "cmp r5, #52\n\t" "ble L_sp_256_sqr_8_outer_%=\n\t" + "ldr lr, [%[a], #28]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif 
"str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_256_sqr_8_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_256_sqr_8_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -70275,8 +71032,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_256_mont_sqr_n_8(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_n_8(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_256_mont_sqr_8(r, a, m, mp); for (; n > 1; n--) { @@ -78372,7 +79129,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -78428,7 +79185,7 @@ static int sp_256_ecc_is_point_8(const sp_point_256* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. 
@@ -79028,10 +79785,13 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_384_mul_12_outer_%=: \n\t" "subs r3, r5, #44\n\t" @@ -79076,13 +79836,86 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #48\n\t" - "beq L_sp_384_mul_12_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_384_mul_12_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_384_mul_12_inner_done_%=\n\t" + "blt L_sp_384_mul_12_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + 
"adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_384_mul_12_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -79090,14 +79923,46 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x58\n\t" + "cmp r5, #0x54\n\t" "ble L_sp_384_mul_12_outer_%=\n\t" + "ldr lr, [%[a], #44]\n\t" + "ldr r11, [%[b], #44]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_384_mul_12_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm 
%[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_384_mul_12_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -84616,10 +85481,12 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_384_sqr_12_outer_%=: \n\t" "subs r3, r5, #44\n\t" @@ -84628,8 +85495,6 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_384_sqr_12_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_384_sqr_12_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -84681,9 +85546,11 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_384_sqr_12_op_done_%=\n\t" - "\n" - "L_sp_384_sqr_12_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_384_sqr_12_inner_done_%=\n\t" + "blt L_sp_384_sqr_12_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -84712,30 +85579,46 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_384_sqr_12_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #48\n\t" - "beq L_sp_384_sqr_12_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_384_sqr_12_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_384_sqr_12_inner_%=\n\t" - "\n" "L_sp_384_sqr_12_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x58\n\t" + "cmp r5, #0x54\n\t" "ble L_sp_384_sqr_12_outer_%=\n\t" + "ldr lr, [%[a], #44]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" 
+ "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_384_sqr_12_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_384_sqr_12_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -88978,8 +89861,8 @@ SP_NOINLINE static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_384_mont_sqr_n_12(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_384_mont_sqr_n_12(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_384_mont_sqr_12(r, a, m, mp); for (; n > 1; n--) { @@ -96322,7 +97205,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -96378,7 +97261,7 @@ static int sp_384_ecc_is_point_12(const sp_point_384* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. 
@@ -97020,10 +97903,13 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_521_mul_17_outer_%=: \n\t" "subs r3, r5, #0x40\n\t" @@ -97068,13 +97954,86 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x44\n\t" - "beq L_sp_521_mul_17_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_521_mul_17_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_521_mul_17_inner_done_%=\n\t" + "blt L_sp_521_mul_17_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, 
r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_521_mul_17_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -97082,17 +98041,49 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x80\n\t" + "cmp r5, #0x7c\n\t" "ble L_sp_521_mul_17_outer_%=\n\t" + "ldr lr, [%[a], #64]\n\t" + "ldr r11, [%[b], #64]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "ldm sp!, {r6, r7}\n\t" "stm %[r]!, {r6, r7}\n\t" "sub r5, r5, #8\n\t" "\n" "L_sp_521_mul_17_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - 
"subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_521_mul_17_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -108130,10 +109121,12 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_521_sqr_17_outer_%=: \n\t" "subs r3, r5, #0x40\n\t" @@ -108142,8 +109135,6 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_521_sqr_17_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_521_sqr_17_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -108195,9 +109186,11 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_521_sqr_17_op_done_%=\n\t" - "\n" - "L_sp_521_sqr_17_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_521_sqr_17_inner_done_%=\n\t" + "blt L_sp_521_sqr_17_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -108226,33 +109219,49 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_521_sqr_17_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x44\n\t" - "beq L_sp_521_sqr_17_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_521_sqr_17_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_521_sqr_17_inner_%=\n\t" - "\n" "L_sp_521_sqr_17_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x80\n\t" + "cmp r5, #0x7c\n\t" "ble L_sp_521_sqr_17_outer_%=\n\t" + "ldr lr, 
[%[a], #64]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "ldm sp!, {r6, r7}\n\t" "stm %[r]!, {r6, r7}\n\t" "sub r5, r5, #8\n\t" "\n" "L_sp_521_sqr_17_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_521_sqr_17_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -115841,8 +116850,8 @@ SP_NOINLINE static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_521_mont_sqr_n_17(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_521_mont_sqr_n_17(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_521_mont_sqr_17(r, a, m, mp); for (; n > 1; n--) { @@ -125146,7 +126155,7 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. 
@@ -125202,7 +126211,7 @@ static int sp_521_ecc_is_point_17(const sp_point_521* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -141063,10 +142072,13 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_1024_mul_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -141111,13 +142123,86 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_1024_mul_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_1024_mul_32_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt 
L_sp_1024_mul_32_inner_done_%=\n\t" + "blt L_sp_1024_mul_32_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_1024_mul_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -141125,14 +142210,46 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_1024_mul_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" + "ldr r11, [%[b], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, 
r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_1024_mul_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_1024_mul_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -141152,10 +142269,12 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_1024_sqr_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -141164,8 +142283,6 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_1024_sqr_32_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_1024_sqr_32_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -141217,9 +142334,11 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_1024_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_1024_sqr_32_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_1024_sqr_32_inner_done_%=\n\t" + "blt L_sp_1024_sqr_32_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -141248,30 +142367,46 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_1024_sqr_32_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_1024_sqr_32_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_1024_sqr_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble 
L_sp_1024_sqr_32_inner_%=\n\t" - "\n" "L_sp_1024_sqr_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_1024_sqr_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_1024_sqr_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_1024_sqr_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -155455,7 +156590,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) } } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -155515,7 +156650,7 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. 
diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 2ba0058e93..ed66e6d198 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -98,7 +99,7 @@ static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n) "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" "blt 2f\n\t" - /* Put in mulitples of 8 bytes. */ + /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" "subs x6, x6, 8\n\t" @@ -6991,7 +6992,7 @@ static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n) "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" "blt 2f\n\t" - /* Put in mulitples of 8 bytes. */ + /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" "subs x6, x6, 8\n\t" @@ -16594,7 +16595,7 @@ static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" "blt 2f\n\t" - /* Put in mulitples of 8 bytes. */ + /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" "subs x6, x6, 8\n\t" @@ -21871,7 +21872,8 @@ static void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b) * a A single precision integer. * b A single precision integer. */ -static void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b) +SP_NOINLINE static void sp_256_mul_4(sp_digit* r, const sp_digit* a, + const sp_digit* b) { __asm__ __volatile__ ( "ldp x13, x14, [%[a], 0]\n\t" @@ -21977,7 +21979,7 @@ static void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b) * r A single precision integer. * a A single precision integer. 
*/ -static void sp_256_sqr_4(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_256_sqr_4(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( "ldp x12, x13, [%[a], 0]\n\t" @@ -22420,8 +22422,8 @@ static void sp_256_cond_copy_4(sp_digit* r, const sp_digit* a, sp_digit m) * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m, sp_digit mp) { (void)m; (void)mp; @@ -22595,8 +22597,8 @@ static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const sp_digit* m, - sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, + const sp_digit* m, sp_digit mp) { (void)m; (void)mp; @@ -22740,8 +22742,8 @@ static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_256_mont_sqr_n_4(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_n_4(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_256_mont_sqr_4(r, a, m, mp); for (; n > 1; n--) { @@ -23080,7 +23082,8 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p, * a Number to double in Montgomery form. * m Modulus (prime). */ -static void sp_256_mont_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_256_mont_dbl_4(sp_digit* r, const sp_digit* a, + const sp_digit* m) { __asm__ __volatile__ ( "ldp x3, x4, [%[a]]\n\t" @@ -23120,7 +23123,8 @@ static void sp_256_mont_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* m) * a Number to triple in Montgomery form. * m Modulus (prime). 
*/ -static void sp_256_mont_tpl_4(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_256_mont_tpl_4(sp_digit* r, const sp_digit* a, + const sp_digit* m) { __asm__ __volatile__ ( "ldp x9, x10, [%[a]]\n\t" @@ -23175,8 +23179,8 @@ static void sp_256_mont_tpl_4(sp_digit* r, const sp_digit* a, const sp_digit* m) * b Number to subtract with in Montgomery form. * m Modulus (prime). */ -static void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +SP_NOINLINE static void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -23217,7 +23221,8 @@ static void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, * a Number to divide. * m Modulus (prime). */ -static void sp_256_mont_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_256_mont_div2_4(sp_digit* r, const sp_digit* a, + const sp_digit* m) { __asm__ __volatile__ ( "ldp x3, x4, [%[a], 0]\n\t" @@ -23249,8 +23254,8 @@ static void sp_256_mont_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m * b Number to subtract with in Montgomery form. * m Modulus (prime). */ -static void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, - sp_digit* b, const sp_digit* m) +SP_NOINLINE static void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, + const sp_digit* a, sp_digit* b, const sp_digit* m) { __asm__ __volatile__ ( "ldp x8, x9, [%[b]]\n\t" @@ -23326,8 +23331,8 @@ static void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, * b Number to subtract with in Montgomery form. * m Modulus (prime). 
*/ -static void sp_256_mont_add_sub_4(sp_digit* ra, sp_digit* rs, const sp_digit* a, - const sp_digit* b, const sp_digit* m) +SP_NOINLINE static void sp_256_mont_add_sub_4(sp_digit* ra, + sp_digit* rs, const sp_digit* a, const sp_digit* b, const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -23806,7 +23811,8 @@ static void sp_256_proj_point_add_4(sp_point_256* r, : [r] "r" (r), [p] "r" (p), [q] "r" (q), [x] "r" (x), [y] "r" (y), [z] "r" (z) : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", - "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", + "cc" ); } } @@ -24038,7 +24044,8 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, : [r] "r" (r), [p] "r" (p), [q] "r" (q), [x] "r" (ctx->x), [y] "r" (ctx->y), [z] "r" (ctx->z) : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", - "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", + "cc" ); } ctx->state = 25; @@ -24281,8 +24288,8 @@ static void sp_256_ecc_recode_6_4(const sp_digit* k, ecc_recode_256* v) * table Table - start of the entries to access * idx Index of entry to retrieve. 
*/ -static void sp_256_get_point_33_4(sp_point_256* r, const sp_point_256* table, - int idx) +SP_NOINLINE static void sp_256_get_point_33_4(sp_point_256* r, + const sp_point_256* table, int idx) { __asm__ __volatile__ ( "mov w30, #1\n\t" @@ -24339,7 +24346,7 @@ static void sp_256_get_point_33_4(sp_point_256* r, const sp_point_256* table, "stp x13, x14, [%[r], #144]\n\t" : [table] "+r" (table) : [r] "r" (r), [idx] "r" (idx) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30", "cc" ); } #endif /* !WC_NO_CACHE_RESISTANT */ @@ -24608,7 +24615,8 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, : [r] "r" (r), [p] "r" (p), [q] "r" (q), [x] "r" (x), [y] "r" (y), [z] "r" (z) : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", - "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", + "cc" ); } } @@ -24739,7 +24747,7 @@ static int sp_256_gen_stripe_table_4(const sp_point_256* a, * table Table - start of the entries to access * idx Index of entry to retrieve. 
*/ -static void sp_256_get_entry_64_4(sp_point_256* r, +SP_NOINLINE static void sp_256_get_entry_64_4(sp_point_256* r, const sp_table_entry_256* table, int idx) { __asm__ __volatile__ ( @@ -24783,7 +24791,7 @@ static void sp_256_get_entry_64_4(sp_point_256* r, "stp x9, x10, [%[r], #80]\n\t" : [table] "+r" (table) : [r] "r" (r), [idx] "r" (idx) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30", "cc" ); } #endif /* !WC_NO_CACHE_RESISTANT */ @@ -25168,7 +25176,7 @@ static int sp_256_gen_stripe_table_4(const sp_point_256* a, * table Table - start of the entries to access * idx Index of entry to retrieve. */ -static void sp_256_get_entry_256_4(sp_point_256* r, +SP_NOINLINE static void sp_256_get_entry_256_4(sp_point_256* r, const sp_table_entry_256* table, int idx) { __asm__ __volatile__ ( @@ -25212,7 +25220,7 @@ static void sp_256_get_entry_256_4(sp_point_256* r, "stp x9, x10, [%[r], #80]\n\t" : [table] "+r" (table) : [r] "r" (r), [idx] "r" (idx) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30", "cc" ); } #endif /* !WC_NO_CACHE_RESISTANT */ @@ -27367,7 +27375,7 @@ static void sp_256_ecc_recode_7_4(const sp_digit* k, ecc_recode_256* v) * table Table - start of the entries to access * idx Index of entry to retrieve. 
*/ -static void sp_256_get_entry_65_4(sp_point_256* r, +SP_NOINLINE static void sp_256_get_entry_65_4(sp_point_256* r, const sp_table_entry_256* table, int idx) { __asm__ __volatile__ ( @@ -27411,7 +27419,7 @@ static void sp_256_get_entry_65_4(sp_point_256* r, "stp x9, x10, [%[r], #80]\n\t" : [table] "+r" (table) : [r] "r" (r), [idx] "r" (idx) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30", "cc" ); } #endif /* !WC_NO_CACHE_RESISTANT */ @@ -39677,7 +39685,7 @@ static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n) "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" "blt 2f\n\t" - /* Put in mulitples of 8 bytes. */ + /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" "subs x6, x6, 8\n\t" @@ -40320,7 +40328,8 @@ static WC_INLINE int sp_256_mod_4(sp_digit* r, const sp_digit* a, const sp_digit * a First operand of the multiplication. * b Second operand of the multiplication. */ -static void sp_256_mont_mul_order_4(sp_digit* r, const sp_digit* a, const sp_digit* b) +SP_NOINLINE static void sp_256_mont_mul_order_4(sp_digit* r, + const sp_digit* a, const sp_digit* b) { __asm__ __volatile__ ( "ldp x13, x14, [%[a], 0]\n\t" @@ -40545,7 +40554,8 @@ static const uint64_t p256_order_minus_2[4] = { * r Result of the squaring. * a Number to square. */ -static void sp_256_mont_sqr_order_4(sp_digit* r, const sp_digit* a) +SP_NOINLINE static void sp_256_mont_sqr_order_4(sp_digit* r, + const sp_digit* a) { __asm__ __volatile__ ( "ldp x12, x13, [%[a], 0]\n\t" @@ -40731,7 +40741,8 @@ static void sp_256_mont_sqr_order_4(sp_digit* r, const sp_digit* a) * r Result of the squaring. * a Number to square. 
*/ -static void sp_256_mont_sqr_n_order_4(sp_digit* r, const sp_digit* a, int n) +SP_NOINLINE static void sp_256_mont_sqr_n_order_4(sp_digit* r, + const sp_digit* a, int n) { __asm__ __volatile__ ( @@ -42079,8 +42090,8 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, * b Second number to add in Montgomery form. * m Modulus (prime). */ -static void sp_256_mont_add_4(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +SP_NOINLINE static void sp_256_mont_add_4(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -42116,7 +42127,7 @@ static void sp_256_mont_add_4(sp_digit* r, const sp_digit* a, const sp_digit* b, (void)m; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -42172,7 +42183,7 @@ static int sp_256_ecc_is_point_4(const sp_point_256* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -43960,8 +43971,8 @@ SP_NOINLINE static void sp_384_mont_sqr_6(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_384_mont_sqr_n_6(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_384_mont_sqr_n_6(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_384_mont_sqr_6(r, a, m, mp); for (; n > 1; n--) { @@ -44249,8 +44260,8 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p, * b Second number to add in Montgomery form. * m Modulus (prime). 
*/ -static void sp_384_mont_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +SP_NOINLINE static void sp_384_mont_add_6(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m) { sp_digit o; @@ -44264,7 +44275,8 @@ static void sp_384_mont_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b, * a Number to double in Montgomery form. * m Modulus (prime). */ -static void sp_384_mont_dbl_6(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_384_mont_dbl_6(sp_digit* r, const sp_digit* a, + const sp_digit* m) { sp_digit o; @@ -44278,7 +44290,8 @@ static void sp_384_mont_dbl_6(sp_digit* r, const sp_digit* a, const sp_digit* m) * a Number to triple in Montgomery form. * m Modulus (prime). */ -static void sp_384_mont_tpl_6(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_384_mont_tpl_6(sp_digit* r, const sp_digit* a, + const sp_digit* m) { sp_digit o; @@ -44376,8 +44389,8 @@ static sp_digit sp_384_cond_add_6(sp_digit* r, const sp_digit* a, const sp_digit * b Number to subtract with in Montgomery form. * m Modulus (prime). */ -static void sp_384_mont_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +SP_NOINLINE static void sp_384_mont_sub_6(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m) { sp_digit o; @@ -44412,7 +44425,8 @@ static void sp_384_rshift1_6(sp_digit* r, const sp_digit* a) * a Number to divide. * m Modulus (prime). */ -static void sp_384_mont_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_384_mont_div2_6(sp_digit* r, const sp_digit* a, + const sp_digit* m) { sp_digit o; @@ -45259,8 +45273,8 @@ static void sp_384_ecc_recode_6_6(const sp_digit* k, ecc_recode_384* v) * table Table - start of the entries to access * idx Index of entry to retrieve. 
*/ -static void sp_384_get_point_33_6(sp_point_384* r, const sp_point_384* table, - int idx) +SP_NOINLINE static void sp_384_get_point_33_6(sp_point_384* r, + const sp_point_384* table, int idx) { int i; sp_digit mask; @@ -66438,7 +66452,7 @@ static void sp_384_from_bin(sp_digit* r, int size, const byte* a, int n) "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" "blt 2f\n\t" - /* Put in mulitples of 8 bytes. */ + /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" "subs x6, x6, 8\n\t" @@ -68193,7 +68207,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -68249,7 +68263,7 @@ static int sp_384_ecc_is_point_6(const sp_point_384* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -72147,8 +72161,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_521_mont_sqr_n_9(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_521_mont_sqr_n_9(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_521_mont_sqr_9(r, a, m, mp); for (; n > 1; n--) { @@ -72447,8 +72461,8 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p, * b Second number to add in Montgomery form. * m Modulus (prime). 
*/ -static void sp_521_mont_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +SP_NOINLINE static void sp_521_mont_add_9(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -72500,7 +72514,8 @@ static void sp_521_mont_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, * a Number to double in Montgomery form. * m Modulus (prime). */ -static void sp_521_mont_dbl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_521_mont_dbl_9(sp_digit* r, const sp_digit* a, + const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -72547,7 +72562,8 @@ static void sp_521_mont_dbl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) * a Number to triple in Montgomery form. * m Modulus (prime). */ -static void sp_521_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_521_mont_tpl_9(sp_digit* r, const sp_digit* a, + const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -72604,8 +72620,8 @@ static void sp_521_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) * b Number to subtract with in Montgomery form. * m Modulus (prime). */ -static void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +SP_NOINLINE static void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -72786,7 +72802,8 @@ static void sp_521_rshift1_9(sp_digit* r, const sp_digit* a) * a Number to divide. * m Modulus (prime). 
*/ -static void sp_521_mont_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_521_mont_div2_9(sp_digit* r, const sp_digit* a, + const sp_digit* m) { sp_digit o; @@ -73635,8 +73652,8 @@ static void sp_521_ecc_recode_6_9(const sp_digit* k, ecc_recode_521* v) * table Table - start of the entries to access * idx Index of entry to retrieve. */ -static void sp_521_get_point_33_9(sp_point_521* r, const sp_point_521* table, - int idx) +SP_NOINLINE static void sp_521_get_point_33_9(sp_point_521* r, + const sp_point_521* table, int idx) { int i; sp_digit mask; @@ -111599,7 +111616,7 @@ static void sp_521_from_bin(sp_digit* r, int size, const byte* a, int n) "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" "blt 2f\n\t" - /* Put in mulitples of 8 bytes. */ + /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" "subs x6, x6, 8\n\t" @@ -113134,7 +113151,7 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -113190,7 +113207,7 @@ static int sp_521_ecc_is_point_9(const sp_point_521* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -116257,8 +116274,8 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p, * b Second number to add in Montgomery form. * m Modulus (prime). 
*/ -static void sp_1024_mont_add_16(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +SP_NOINLINE static void sp_1024_mont_add_16(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -116358,7 +116375,8 @@ static void sp_1024_mont_add_16(sp_digit* r, const sp_digit* a, const sp_digit* * a Number to double in Montgomery form. * m Modulus (prime). */ -static void sp_1024_mont_dbl_16(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_1024_mont_dbl_16(sp_digit* r, const sp_digit* a, + const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -116450,7 +116468,8 @@ static void sp_1024_mont_dbl_16(sp_digit* r, const sp_digit* a, const sp_digit* * a Number to triple in Montgomery form. * m Modulus (prime). */ -static void sp_1024_mont_tpl_16(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_1024_mont_tpl_16(sp_digit* r, const sp_digit* a, + const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -116612,8 +116631,8 @@ static void sp_1024_mont_tpl_16(sp_digit* r, const sp_digit* a, const sp_digit* * b Number to subtract with in Montgomery form. * m Modulus (prime). */ -static void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +SP_NOINLINE static void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, + const sp_digit* b, const sp_digit* m) { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" @@ -116881,7 +116900,8 @@ static void sp_1024_rshift1_16(sp_digit* r, const sp_digit* a) * a Number to divide. * m Modulus (prime). 
*/ -static void sp_1024_mont_div2_16(sp_digit* r, const sp_digit* a, const sp_digit* m) +SP_NOINLINE static void sp_1024_mont_div2_16(sp_digit* r, const sp_digit* a, + const sp_digit* m) { sp_digit o; @@ -125220,7 +125240,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" "blt 2f\n\t" - /* Put in mulitples of 8 bytes. */ + /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" "subs x6, x6, 8\n\t" @@ -125314,7 +125334,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) ); } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -125374,7 +125394,7 @@ static int sp_1024_ecc_is_point_16(const sp_point_1024* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index 1873ef373b..98a338b686 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -98861,8 +98862,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. 
*/ -static void sp_256_mont_sqr_n_8(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_n_8(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_256_mont_sqr_8(r, a, m, mp); for (; n > 1; n--) { @@ -107673,7 +107674,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -107729,7 +107730,7 @@ static int sp_256_ecc_is_point_8(const sp_point_256* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -110309,8 +110310,8 @@ SP_NOINLINE static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_384_mont_sqr_n_12(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_384_mont_sqr_n_12(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_384_mont_sqr_12(r, a, m, mp); for (; n > 1; n--) { @@ -118869,7 +118870,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -118925,7 +118926,7 @@ static int sp_384_ecc_is_point_12(const sp_point_384* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. 
* pY Y ordinate of EC point. @@ -122593,8 +122594,8 @@ SP_NOINLINE static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_521_mont_sqr_n_17(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_521_mont_sqr_n_17(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_521_mont_sqr_17(r, a, m, mp); for (; n > 1; n--) { @@ -135811,7 +135812,7 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -135867,7 +135868,7 @@ static int sp_521_ecc_is_point_17(const sp_point_521* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -218580,7 +218581,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) } } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -218640,7 +218641,7 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. 
diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index 37a7ea28dc..468e0fcfef 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -59,6 +59,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -20987,8 +20988,8 @@ SP_NOINLINE static void sp_256_mont_sqr_9(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_256_mont_sqr_n_9(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_n_9(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_256_mont_sqr_9(r, a, m, mp); for (; n > 1; n--) { @@ -26540,7 +26541,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -26596,7 +26597,7 @@ static int sp_256_ecc_is_point_9(const sp_point_256* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -28395,8 +28396,8 @@ SP_NOINLINE static void sp_384_mont_sqr_15(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. 
*/ -static void sp_384_mont_sqr_n_15(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_384_mont_sqr_n_15(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_384_mont_sqr_15(r, a, m, mp); for (; n > 1; n--) { @@ -34621,7 +34622,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -34677,7 +34678,7 @@ static int sp_384_ecc_is_point_15(const sp_point_384* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -36068,8 +36069,8 @@ SP_NOINLINE static void sp_521_mont_sqr_21(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_521_mont_sqr_n_21(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_521_mont_sqr_n_21(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_521_mont_sqr_21(r, a, m, mp); for (; n > 1; n--) { @@ -42795,7 +42796,7 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -42851,7 +42852,7 @@ static int sp_521_ecc_is_point_21(const sp_point_521* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. 
* pY Y ordinate of EC point. @@ -54684,7 +54685,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) } } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -54744,7 +54745,7 @@ static int sp_1024_ecc_is_point_42(const sp_point_1024* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index d1888cd474..a2b97d8169 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -59,6 +59,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -21967,8 +21968,8 @@ SP_NOINLINE static void sp_256_mont_sqr_5(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_256_mont_sqr_n_5(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_n_5(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_256_mont_sqr_5(r, a, m, mp); for (; n > 1; n--) { @@ -27434,7 +27435,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -27490,7 +27491,7 @@ static int sp_256_ecc_is_point_5(const sp_point_256* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. 
+/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -28871,8 +28872,8 @@ SP_NOINLINE static void sp_384_mont_sqr_7(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_384_mont_sqr_n_7(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_384_mont_sqr_n_7(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_384_mont_sqr_7(r, a, m, mp); for (; n > 1; n--) { @@ -34902,7 +34903,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -34958,7 +34959,7 @@ static int sp_384_ecc_is_point_7(const sp_point_384* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -36409,8 +36410,8 @@ SP_NOINLINE static void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_521_mont_sqr_n_9(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_521_mont_sqr_n_9(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_521_mont_sqr_9(r, a, m, mp); for (; n > 1; n--) { @@ -42386,7 +42387,7 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. 
@@ -42442,7 +42443,7 @@ static int sp_521_ecc_is_point_9(const sp_point_521* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -53268,7 +53269,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) } } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -53328,7 +53329,7 @@ static int sp_1024_ecc_is_point_18(const sp_point_1024* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index 48263c751d..285f4eb3d5 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -2222,7 +2223,7 @@ static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x100\n\t" "\n" - "L_sp_2048_add_64_word_%=:\n\t" + "L_sp_2048_add_64_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -2235,9 +2236,9 @@ static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_add_64_word_%=\n\t" + "BNE L_sp_2048_add_64_word\n\t" #else - "BNE.N L_sp_2048_add_64_word_%=\n\t" + "BNE.N L_sp_2048_add_64_word\n\t" #endif "MOV %[r], r3\n\t" : 
[r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -2269,7 +2270,7 @@ static sp_digit sp_2048_sub_in_place_64(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x100\n\t" "\n" - "L_sp_2048_sub_in_pkace_64_word_%=:\n\t" + "L_sp_2048_sub_in_pkace_64_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -2281,9 +2282,9 @@ static sp_digit sp_2048_sub_in_place_64(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_sub_in_pkace_64_word_%=\n\t" + "BNE L_sp_2048_sub_in_pkace_64_word\n\t" #else - "BNE.N L_sp_2048_sub_in_pkace_64_word_%=\n\t" + "BNE.N L_sp_2048_sub_in_pkace_64_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -2315,61 +2316,80 @@ static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x200\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_2048_mul_64_outer_%=:\n\t" + "L_sp_2048_mul_64_outer:\n\t" "SUBS r3, r5, #0xfc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_2048_mul_64_inner_%=:\n\t" + "L_sp_2048_mul_64_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x100\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_mul_64_inner_done_%=\n\t" + "BGT L_sp_2048_mul_64_inner_done\n\t" #else - "BEQ.N 
L_sp_2048_mul_64_inner_done_%=\n\t" + "BGT.N L_sp_2048_mul_64_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_64_inner_%=\n\t" + "BLT L_sp_2048_mul_64_inner\n\t" #else - "BLE.N L_sp_2048_mul_64_inner_%=\n\t" + "BLT.N L_sp_2048_mul_64_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_2048_mul_64_inner_done_%=:\n\t" + "L_sp_2048_mul_64_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x1f8\n\t" + "CMP r5, #0x1f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_64_outer_%=\n\t" + "BLE L_sp_2048_mul_64_outer\n\t" #else - "BLE.N L_sp_2048_mul_64_outer_%=\n\t" + "BLE.N L_sp_2048_mul_64_outer\n\t" #endif + "LDR lr, [%[a], #252]\n\t" + "LDR r11, [%[b], #252]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_2048_mul_64_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_2048_mul_64_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_mul_64_store_%=\n\t" + "BGT L_sp_2048_mul_64_store\n\t" #else - "BGT.N L_sp_2048_mul_64_store_%=\n\t" + "BGT.N L_sp_2048_mul_64_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -2395,24 +2415,20 @@ static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x200\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, 
#0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_2048_sqr_64_outer_%=:\n\t" + "L_sp_2048_sqr_64_outer:\n\t" "SUBS r3, r5, #0xfc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_2048_sqr_64_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_64_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_2048_sqr_64_op_sqr_%=\n\t" -#endif + "L_sp_2048_sqr_64_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -2422,59 +2438,51 @@ static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_2048_sqr_64_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_64_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_2048_sqr_64_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x100\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_64_inner_done_%=\n\t" -#else - "BEQ.N L_sp_2048_sqr_64_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_sqr_64_inner_done_%=\n\t" + "BGT L_sp_2048_sqr_64_inner_done\n\t" #else - "BGT.N L_sp_2048_sqr_64_inner_done_%=\n\t" + "BGT.N L_sp_2048_sqr_64_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_64_inner_%=\n\t" + "BLT L_sp_2048_sqr_64_inner\n\t" #else - "BLE.N L_sp_2048_sqr_64_inner_%=\n\t" + "BLT.N L_sp_2048_sqr_64_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_2048_sqr_64_inner_done_%=:\n\t" + "L_sp_2048_sqr_64_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" 
"MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x1f8\n\t" + "CMP r5, #0x1f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_64_outer_%=\n\t" + "BLE L_sp_2048_sqr_64_outer\n\t" #else - "BLE.N L_sp_2048_sqr_64_outer_%=\n\t" + "BLE.N L_sp_2048_sqr_64_outer\n\t" #endif + "LDR lr, [%[a], #252]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_2048_sqr_64_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_2048_sqr_64_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_sqr_64_store_%=\n\t" + "BGT L_sp_2048_sqr_64_store\n\t" #else - "BGT.N L_sp_2048_sqr_64_store_%=\n\t" + "BGT.N L_sp_2048_sqr_64_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -2524,7 +2532,7 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x80\n\t" "\n" - "L_sp_2048_add_32_word_%=:\n\t" + "L_sp_2048_add_32_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -2537,9 +2545,9 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_add_32_word_%=\n\t" + "BNE L_sp_2048_add_32_word\n\t" #else - "BNE.N L_sp_2048_add_32_word_%=\n\t" + "BNE.N L_sp_2048_add_32_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -2571,7 +2579,7 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x80\n\t" "\n" - 
"L_sp_2048_sub_in_pkace_32_word_%=:\n\t" + "L_sp_2048_sub_in_pkace_32_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -2583,9 +2591,9 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_sub_in_pkace_32_word_%=\n\t" + "BNE L_sp_2048_sub_in_pkace_32_word\n\t" #else - "BNE.N L_sp_2048_sub_in_pkace_32_word_%=\n\t" + "BNE.N L_sp_2048_sub_in_pkace_32_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -2617,61 +2625,80 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_2048_mul_32_outer_%=:\n\t" + "L_sp_2048_mul_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_2048_mul_32_inner_%=:\n\t" + "L_sp_2048_mul_32_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_mul_32_inner_done_%=\n\t" + "BGT L_sp_2048_mul_32_inner_done\n\t" #else - "BEQ.N L_sp_2048_mul_32_inner_done_%=\n\t" + "BGT.N L_sp_2048_mul_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE 
L_sp_2048_mul_32_inner_%=\n\t" + "BLT L_sp_2048_mul_32_inner\n\t" #else - "BLE.N L_sp_2048_mul_32_inner_%=\n\t" + "BLT.N L_sp_2048_mul_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_2048_mul_32_inner_done_%=:\n\t" + "L_sp_2048_mul_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_32_outer_%=\n\t" + "BLE L_sp_2048_mul_32_outer\n\t" #else - "BLE.N L_sp_2048_mul_32_outer_%=\n\t" + "BLE.N L_sp_2048_mul_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "LDR r11, [%[b], #124]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_2048_mul_32_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_2048_mul_32_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_mul_32_store_%=\n\t" + "BGT L_sp_2048_mul_32_store\n\t" #else - "BGT.N L_sp_2048_mul_32_store_%=\n\t" + "BGT.N L_sp_2048_mul_32_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -2697,24 +2724,20 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_2048_sqr_32_outer_%=:\n\t" + "L_sp_2048_sqr_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, 
r5, r3\n\t" "\n" - "L_sp_2048_sqr_32_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_32_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_2048_sqr_32_op_sqr_%=\n\t" -#endif + "L_sp_2048_sqr_32_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -2724,59 +2747,51 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_2048_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_32_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_2048_sqr_32_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_32_inner_done_%=\n\t" -#else - "BEQ.N L_sp_2048_sqr_32_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_sqr_32_inner_done_%=\n\t" + "BGT L_sp_2048_sqr_32_inner_done\n\t" #else - "BGT.N L_sp_2048_sqr_32_inner_done_%=\n\t" + "BGT.N L_sp_2048_sqr_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_32_inner_%=\n\t" + "BLT L_sp_2048_sqr_32_inner\n\t" #else - "BLE.N L_sp_2048_sqr_32_inner_%=\n\t" + "BLT.N L_sp_2048_sqr_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_2048_sqr_32_inner_done_%=:\n\t" + "L_sp_2048_sqr_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE 
L_sp_2048_sqr_32_outer_%=\n\t" + "BLE L_sp_2048_sqr_32_outer\n\t" #else - "BLE.N L_sp_2048_sqr_32_outer_%=\n\t" + "BLE.N L_sp_2048_sqr_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_2048_sqr_32_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_2048_sqr_32_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_sqr_32_store_%=\n\t" + "BGT L_sp_2048_sqr_32_store\n\t" #else - "BGT.N L_sp_2048_sqr_32_store_%=\n\t" + "BGT.N L_sp_2048_sqr_32_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -2835,7 +2850,7 @@ static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_2048_mul_d_64_word_%=:\n\t" + "L_sp_2048_mul_d_64_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -2849,9 +2864,9 @@ static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mul_d_64_word_%=\n\t" + "BLT L_sp_2048_mul_d_64_word\n\t" #else - "BLT.N L_sp_2048_mul_d_64_word_%=\n\t" + "BLT.N L_sp_2048_mul_d_64_word\n\t" #endif "STR r3, [%[r], #256]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -3249,7 +3264,7 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_2048_cond_sub_32_words_%=:\n\t" + "L_sp_2048_cond_sub_32_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -3260,9 +3275,9 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, 
#0x4\n\t" "CMP r5, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_cond_sub_32_words_%=\n\t" + "BLT L_sp_2048_cond_sub_32_words\n\t" #else - "BLT.N L_sp_2048_cond_sub_32_words_%=\n\t" + "BLT.N L_sp_2048_cond_sub_32_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -3445,7 +3460,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_2048_mont_reduce_32_word_%=:\n\t" + "L_sp_2048_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -3708,9 +3723,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x80\n\t" #ifdef __GNUC__ - "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_word\n\t" #else - "BLT.W L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT.W L_sp_2048_mont_reduce_32_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -3749,7 +3764,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_32_word_%=:\n\t" + "L_sp_2048_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -3757,7 +3772,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_32_mul_%=:\n\t" + "L_sp_2048_mont_reduce_32_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -3800,9 +3815,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_32_mul_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_mul\n\t" #else - "BLT.N 
L_sp_2048_mont_reduce_32_mul_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_32_mul\n\t" #endif "LDR r10, [%[a], #128]\n\t" "ADDS r4, r4, r3\n\t" @@ -3816,9 +3831,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_word\n\t" #else - "BLT.N L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_32_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -3860,7 +3875,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_2048_mont_reduce_32_word_%=:\n\t" + "L_sp_2048_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -4028,9 +4043,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x80\n\t" #ifdef __GNUC__ - "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_word\n\t" #else - "BLT.W L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT.W L_sp_2048_mont_reduce_32_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -4072,7 +4087,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_32_word_%=:\n\t" + "L_sp_2048_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -4080,7 +4095,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_32_mul_%=:\n\t" + "L_sp_2048_mont_reduce_32_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -4111,9 +4126,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD 
r12, r12, #0x4\n\t" "CMP r12, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_32_mul_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_mul\n\t" #else - "BLT.N L_sp_2048_mont_reduce_32_mul_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_32_mul\n\t" #endif "LDR r10, [%[a], #128]\n\t" "ADDS r4, r4, r3\n\t" @@ -4127,9 +4142,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_word\n\t" #else - "BLT.N L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_32_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -4200,7 +4215,7 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_2048_mul_d_32_word_%=:\n\t" + "L_sp_2048_mul_d_32_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -4214,9 +4229,9 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mul_d_32_word_%=\n\t" + "BLT L_sp_2048_mul_d_32_word\n\t" #else - "BLT.N L_sp_2048_mul_d_32_word_%=\n\t" + "BLT.N L_sp_2048_mul_d_32_word\n\t" #endif "STR r3, [%[r], #128]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -4423,9 +4438,9 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -4488,9 +4503,9 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -4514,7 +4529,7 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_2048_word_32_bit_%=:\n\t" + "L_div_2048_word_32_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -4524,7 +4539,7 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_2048_word_32_bit_%=\n\t" + "bpl L_div_2048_word_32_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -4576,7 +4591,7 @@ static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x7c\n\t" "\n" - "L_sp_2048_cmp_32_words_%=:\n\t" + "L_sp_2048_cmp_32_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -4589,7 +4604,7 @@ static sp_int32 sp_2048_cmp_32(const sp_digit* a, 
const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_2048_cmp_32_words_%=\n\t" + "bcs L_sp_2048_cmp_32_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #124]\n\t" @@ -5377,7 +5392,7 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_2048_cond_sub_64_words_%=:\n\t" + "L_sp_2048_cond_sub_64_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -5388,9 +5403,9 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, #0x4\n\t" "CMP r5, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_cond_sub_64_words_%=\n\t" + "BLT L_sp_2048_cond_sub_64_words\n\t" #else - "BLT.N L_sp_2048_cond_sub_64_words_%=\n\t" + "BLT.N L_sp_2048_cond_sub_64_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -5685,7 +5700,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_2048_mont_reduce_64_word_%=:\n\t" + "L_sp_2048_mont_reduce_64_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -6204,9 +6219,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x100\n\t" #ifdef __GNUC__ - "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_word\n\t" #else - "BLT.W L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT.W L_sp_2048_mont_reduce_64_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -6245,7 +6260,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_64_word_%=:\n\t" + "L_sp_2048_mont_reduce_64_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -6253,7 +6268,7 @@ static 
void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_64_mul_%=:\n\t" + "L_sp_2048_mont_reduce_64_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -6296,9 +6311,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_64_mul_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_mul\n\t" #else - "BLT.N L_sp_2048_mont_reduce_64_mul_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_64_mul\n\t" #endif "LDR r10, [%[a], #256]\n\t" "ADDS r4, r4, r3\n\t" @@ -6312,9 +6327,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_word\n\t" #else - "BLT.N L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_64_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -6356,7 +6371,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_2048_mont_reduce_64_word_%=:\n\t" + "L_sp_2048_mont_reduce_64_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -6684,9 +6699,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x100\n\t" #ifdef __GNUC__ - "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_word\n\t" #else - "BLT.W L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT.W L_sp_2048_mont_reduce_64_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -6728,7 +6743,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit 
mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_64_word_%=:\n\t" + "L_sp_2048_mont_reduce_64_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -6736,7 +6751,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_64_mul_%=:\n\t" + "L_sp_2048_mont_reduce_64_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -6767,9 +6782,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_64_mul_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_mul\n\t" #else - "BLT.N L_sp_2048_mont_reduce_64_mul_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_64_mul\n\t" #endif "LDR r10, [%[a], #256]\n\t" "ADDS r4, r4, r3\n\t" @@ -6783,9 +6798,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_word\n\t" #else - "BLT.N L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_64_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -6851,7 +6866,7 @@ static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x100\n\t" "\n" - "L_sp_2048_sub_64_word_%=:\n\t" + "L_sp_2048_sub_64_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -6863,9 +6878,9 @@ static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_sub_64_word_%=\n\t" + "BNE 
L_sp_2048_sub_64_word\n\t" #else - "BNE.N L_sp_2048_sub_64_word_%=\n\t" + "BNE.N L_sp_2048_sub_64_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -7027,9 +7042,9 @@ static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -7092,9 +7107,9 @@ static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -7118,7 +7133,7 @@ static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_2048_word_64_bit_%=:\n\t" + "L_div_2048_word_64_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -7128,7 +7143,7 @@ static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_2048_word_64_bit_%=\n\t" + "bpl L_div_2048_word_64_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, 
%[div]\n\t" @@ -7283,7 +7298,7 @@ static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0xfc\n\t" "\n" - "L_sp_2048_cmp_64_words_%=:\n\t" + "L_sp_2048_cmp_64_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -7296,7 +7311,7 @@ static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_2048_cmp_64_words_%=\n\t" + "bcs L_sp_2048_cmp_64_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #252]\n\t" @@ -8559,7 +8574,7 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_cond_add_32_words_%=:\n\t" + "L_sp_2048_cond_add_32_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -8570,9 +8585,9 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "ADD r4, r4, #0x4\n\t" "CMP r4, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_cond_add_32_words_%=\n\t" + "BLT L_sp_2048_cond_add_32_words\n\t" #else - "BLT.N L_sp_2048_cond_add_32_words_%=\n\t" + "BLT.N L_sp_2048_cond_add_32_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -12945,7 +12960,7 @@ static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x180\n\t" "\n" - "L_sp_3072_add_96_word_%=:\n\t" + "L_sp_3072_add_96_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -12958,9 +12973,9 @@ static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_add_96_word_%=\n\t" + "BNE L_sp_3072_add_96_word\n\t" #else - "BNE.N 
L_sp_3072_add_96_word_%=\n\t" + "BNE.N L_sp_3072_add_96_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -12992,7 +13007,7 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x180\n\t" "\n" - "L_sp_3072_sub_in_pkace_96_word_%=:\n\t" + "L_sp_3072_sub_in_pkace_96_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -13004,9 +13019,9 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_sub_in_pkace_96_word_%=\n\t" + "BNE L_sp_3072_sub_in_pkace_96_word\n\t" #else - "BNE.N L_sp_3072_sub_in_pkace_96_word_%=\n\t" + "BNE.N L_sp_3072_sub_in_pkace_96_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -13038,61 +13053,80 @@ static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x300\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_3072_mul_96_outer_%=:\n\t" + "L_sp_3072_mul_96_outer:\n\t" "SUBS r3, r5, #0x17c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_3072_mul_96_inner_%=:\n\t" + "L_sp_3072_mul_96_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x180\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ 
L_sp_3072_mul_96_inner_done_%=\n\t" + "BGT L_sp_3072_mul_96_inner_done\n\t" #else - "BEQ.N L_sp_3072_mul_96_inner_done_%=\n\t" + "BGT.N L_sp_3072_mul_96_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_96_inner_%=\n\t" + "BLT L_sp_3072_mul_96_inner\n\t" #else - "BLE.N L_sp_3072_mul_96_inner_%=\n\t" + "BLT.N L_sp_3072_mul_96_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_3072_mul_96_inner_done_%=:\n\t" + "L_sp_3072_mul_96_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x2f8\n\t" + "CMP r5, #0x2f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_96_outer_%=\n\t" + "BLE L_sp_3072_mul_96_outer\n\t" #else - "BLE.N L_sp_3072_mul_96_outer_%=\n\t" + "BLE.N L_sp_3072_mul_96_outer\n\t" #endif + "LDR lr, [%[a], #380]\n\t" + "LDR r11, [%[b], #380]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_3072_mul_96_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_3072_mul_96_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_mul_96_store_%=\n\t" + "BGT L_sp_3072_mul_96_store\n\t" #else - "BGT.N L_sp_3072_mul_96_store_%=\n\t" + "BGT.N L_sp_3072_mul_96_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -13118,24 +13152,20 @@ static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x300\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL 
r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_3072_sqr_96_outer_%=:\n\t" + "L_sp_3072_sqr_96_outer:\n\t" "SUBS r3, r5, #0x17c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_3072_sqr_96_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_96_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_3072_sqr_96_op_sqr_%=\n\t" -#endif + "L_sp_3072_sqr_96_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -13145,59 +13175,51 @@ static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_3072_sqr_96_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_96_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_3072_sqr_96_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x180\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_96_inner_done_%=\n\t" -#else - "BEQ.N L_sp_3072_sqr_96_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_sqr_96_inner_done_%=\n\t" + "BGT L_sp_3072_sqr_96_inner_done\n\t" #else - "BGT.N L_sp_3072_sqr_96_inner_done_%=\n\t" + "BGT.N L_sp_3072_sqr_96_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_96_inner_%=\n\t" + "BLT L_sp_3072_sqr_96_inner\n\t" #else - "BLE.N L_sp_3072_sqr_96_inner_%=\n\t" + "BLT.N L_sp_3072_sqr_96_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - 
"L_sp_3072_sqr_96_inner_done_%=:\n\t" + "L_sp_3072_sqr_96_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x2f8\n\t" + "CMP r5, #0x2f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_96_outer_%=\n\t" + "BLE L_sp_3072_sqr_96_outer\n\t" #else - "BLE.N L_sp_3072_sqr_96_outer_%=\n\t" + "BLE.N L_sp_3072_sqr_96_outer\n\t" #endif + "LDR lr, [%[a], #380]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_3072_sqr_96_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_3072_sqr_96_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_sqr_96_store_%=\n\t" + "BGT L_sp_3072_sqr_96_store\n\t" #else - "BGT.N L_sp_3072_sqr_96_store_%=\n\t" + "BGT.N L_sp_3072_sqr_96_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -13247,7 +13269,7 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0xc0\n\t" "\n" - "L_sp_3072_add_48_word_%=:\n\t" + "L_sp_3072_add_48_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -13260,9 +13282,9 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_add_48_word_%=\n\t" + "BNE L_sp_3072_add_48_word\n\t" #else - "BNE.N L_sp_3072_add_48_word_%=\n\t" + "BNE.N L_sp_3072_add_48_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -13294,7 +13316,7 @@ static sp_digit 
sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0xc0\n\t" "\n" - "L_sp_3072_sub_in_pkace_48_word_%=:\n\t" + "L_sp_3072_sub_in_pkace_48_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -13306,9 +13328,9 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_sub_in_pkace_48_word_%=\n\t" + "BNE L_sp_3072_sub_in_pkace_48_word\n\t" #else - "BNE.N L_sp_3072_sub_in_pkace_48_word_%=\n\t" + "BNE.N L_sp_3072_sub_in_pkace_48_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -13340,61 +13362,80 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x180\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_3072_mul_48_outer_%=:\n\t" + "L_sp_3072_mul_48_outer:\n\t" "SUBS r3, r5, #0xbc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_3072_mul_48_inner_%=:\n\t" + "L_sp_3072_mul_48_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0xc0\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_mul_48_inner_done_%=\n\t" + "BGT L_sp_3072_mul_48_inner_done\n\t" #else - "BEQ.N L_sp_3072_mul_48_inner_done_%=\n\t" + "BGT.N L_sp_3072_mul_48_inner_done\n\t" #endif - 
"CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_48_inner_%=\n\t" + "BLT L_sp_3072_mul_48_inner\n\t" #else - "BLE.N L_sp_3072_mul_48_inner_%=\n\t" + "BLT.N L_sp_3072_mul_48_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_3072_mul_48_inner_done_%=:\n\t" + "L_sp_3072_mul_48_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x178\n\t" + "CMP r5, #0x174\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_48_outer_%=\n\t" + "BLE L_sp_3072_mul_48_outer\n\t" #else - "BLE.N L_sp_3072_mul_48_outer_%=\n\t" + "BLE.N L_sp_3072_mul_48_outer\n\t" #endif + "LDR lr, [%[a], #188]\n\t" + "LDR r11, [%[b], #188]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_3072_mul_48_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_3072_mul_48_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_mul_48_store_%=\n\t" + "BGT L_sp_3072_mul_48_store\n\t" #else - "BGT.N L_sp_3072_mul_48_store_%=\n\t" + "BGT.N L_sp_3072_mul_48_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -13420,24 +13461,20 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x180\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_3072_sqr_48_outer_%=:\n\t" + 
"L_sp_3072_sqr_48_outer:\n\t" "SUBS r3, r5, #0xbc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_3072_sqr_48_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_48_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_3072_sqr_48_op_sqr_%=\n\t" -#endif + "L_sp_3072_sqr_48_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -13447,59 +13484,51 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_3072_sqr_48_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_48_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_3072_sqr_48_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0xc0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_48_inner_done_%=\n\t" -#else - "BEQ.N L_sp_3072_sqr_48_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_sqr_48_inner_done_%=\n\t" + "BGT L_sp_3072_sqr_48_inner_done\n\t" #else - "BGT.N L_sp_3072_sqr_48_inner_done_%=\n\t" + "BGT.N L_sp_3072_sqr_48_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_48_inner_%=\n\t" + "BLT L_sp_3072_sqr_48_inner\n\t" #else - "BLE.N L_sp_3072_sqr_48_inner_%=\n\t" + "BLT.N L_sp_3072_sqr_48_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_3072_sqr_48_inner_done_%=:\n\t" + "L_sp_3072_sqr_48_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, 
#0x178\n\t" + "CMP r5, #0x174\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_48_outer_%=\n\t" + "BLE L_sp_3072_sqr_48_outer\n\t" #else - "BLE.N L_sp_3072_sqr_48_outer_%=\n\t" + "BLE.N L_sp_3072_sqr_48_outer\n\t" #endif + "LDR lr, [%[a], #188]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_3072_sqr_48_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_3072_sqr_48_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_sqr_48_store_%=\n\t" + "BGT L_sp_3072_sqr_48_store\n\t" #else - "BGT.N L_sp_3072_sqr_48_store_%=\n\t" + "BGT.N L_sp_3072_sqr_48_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -13558,7 +13587,7 @@ static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_3072_mul_d_96_word_%=:\n\t" + "L_sp_3072_mul_d_96_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -13572,9 +13601,9 @@ static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mul_d_96_word_%=\n\t" + "BLT L_sp_3072_mul_d_96_word\n\t" #else - "BLT.N L_sp_3072_mul_d_96_word_%=\n\t" + "BLT.N L_sp_3072_mul_d_96_word\n\t" #endif "STR r3, [%[r], #384]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -14132,7 +14161,7 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_3072_cond_sub_48_words_%=:\n\t" + "L_sp_3072_cond_sub_48_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], 
r5]\n\t" @@ -14143,9 +14172,9 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, #0x4\n\t" "CMP r5, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_cond_sub_48_words_%=\n\t" + "BLT L_sp_3072_cond_sub_48_words\n\t" #else - "BLT.N L_sp_3072_cond_sub_48_words_%=\n\t" + "BLT.N L_sp_3072_cond_sub_48_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -14384,7 +14413,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_3072_mont_reduce_48_word_%=:\n\t" + "L_sp_3072_mont_reduce_48_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -14775,9 +14804,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0xc0\n\t" #ifdef __GNUC__ - "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_word\n\t" #else - "BLT.W L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT.W L_sp_3072_mont_reduce_48_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -14816,7 +14845,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_48_word_%=:\n\t" + "L_sp_3072_mont_reduce_48_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -14824,7 +14853,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_48_mul_%=:\n\t" + "L_sp_3072_mont_reduce_48_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -14867,9 +14896,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_48_mul_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_mul\n\t" #else - "BLT.N L_sp_3072_mont_reduce_48_mul_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_48_mul\n\t" #endif "LDR r10, [%[a], #192]\n\t" "ADDS r4, r4, r3\n\t" @@ -14883,9 +14912,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_word\n\t" #else - "BLT.N L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_48_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -14927,7 +14956,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_3072_mont_reduce_48_word_%=:\n\t" + "L_sp_3072_mont_reduce_48_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -15175,9 +15204,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0xc0\n\t" #ifdef __GNUC__ - "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_word\n\t" #else - "BLT.W L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT.W L_sp_3072_mont_reduce_48_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -15219,7 +15248,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_48_word_%=:\n\t" + "L_sp_3072_mont_reduce_48_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -15227,7 +15256,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_48_mul_%=:\n\t" + "L_sp_3072_mont_reduce_48_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], 
r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -15258,9 +15287,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_48_mul_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_mul\n\t" #else - "BLT.N L_sp_3072_mont_reduce_48_mul_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_48_mul\n\t" #endif "LDR r10, [%[a], #192]\n\t" "ADDS r4, r4, r3\n\t" @@ -15274,9 +15303,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_word\n\t" #else - "BLT.N L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_48_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -15347,7 +15376,7 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_3072_mul_d_48_word_%=:\n\t" + "L_sp_3072_mul_d_48_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -15361,9 +15390,9 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mul_d_48_word_%=\n\t" + "BLT L_sp_3072_mul_d_48_word\n\t" #else - "BLT.N L_sp_3072_mul_d_48_word_%=\n\t" + "BLT.N L_sp_3072_mul_d_48_word\n\t" #endif "STR r3, [%[r], #192]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -15650,9 +15679,9 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -15715,9 +15744,9 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -15741,7 +15770,7 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_3072_word_48_bit_%=:\n\t" + "L_div_3072_word_48_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -15751,7 +15780,7 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_3072_word_48_bit_%=\n\t" + "bpl L_div_3072_word_48_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -15803,7 +15832,7 @@ static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0xbc\n\t" "\n" - "L_sp_3072_cmp_48_words_%=:\n\t" + "L_sp_3072_cmp_48_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -15816,7 +15845,7 @@ static sp_int32 sp_3072_cmp_48(const 
sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_3072_cmp_48_words_%=\n\t" + "bcs L_sp_3072_cmp_48_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #188]\n\t" @@ -16780,7 +16809,7 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_3072_cond_sub_96_words_%=:\n\t" + "L_sp_3072_cond_sub_96_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -16791,9 +16820,9 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, #0x4\n\t" "CMP r5, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_cond_sub_96_words_%=\n\t" + "BLT L_sp_3072_cond_sub_96_words\n\t" #else - "BLT.N L_sp_3072_cond_sub_96_words_%=\n\t" + "BLT.N L_sp_3072_cond_sub_96_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -17200,7 +17229,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_3072_mont_reduce_96_word_%=:\n\t" + "L_sp_3072_mont_reduce_96_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -17975,9 +18004,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x180\n\t" #ifdef __GNUC__ - "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_word\n\t" #else - "BLT.W L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT.W L_sp_3072_mont_reduce_96_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -18016,7 +18045,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_96_word_%=:\n\t" + "L_sp_3072_mont_reduce_96_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ 
-18024,7 +18053,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_96_mul_%=:\n\t" + "L_sp_3072_mont_reduce_96_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -18067,9 +18096,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_96_mul_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_mul\n\t" #else - "BLT.N L_sp_3072_mont_reduce_96_mul_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_96_mul\n\t" #endif "LDR r10, [%[a], #384]\n\t" "ADDS r4, r4, r3\n\t" @@ -18083,9 +18112,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_word\n\t" #else - "BLT.N L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_96_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -18127,7 +18156,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_3072_mont_reduce_96_word_%=:\n\t" + "L_sp_3072_mont_reduce_96_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -18615,9 +18644,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x180\n\t" #ifdef __GNUC__ - "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_word\n\t" #else - "BLT.W L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT.W L_sp_3072_mont_reduce_96_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -18659,7 +18688,7 @@ static void 
sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_96_word_%=:\n\t" + "L_sp_3072_mont_reduce_96_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -18667,7 +18696,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_96_mul_%=:\n\t" + "L_sp_3072_mont_reduce_96_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -18698,9 +18727,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_96_mul_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_mul\n\t" #else - "BLT.N L_sp_3072_mont_reduce_96_mul_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_96_mul\n\t" #endif "LDR r10, [%[a], #384]\n\t" "ADDS r4, r4, r3\n\t" @@ -18714,9 +18743,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_word\n\t" #else - "BLT.N L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_96_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -18782,7 +18811,7 @@ static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x180\n\t" "\n" - "L_sp_3072_sub_96_word_%=:\n\t" + "L_sp_3072_sub_96_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -18794,9 +18823,9 @@ static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_sub_96_word_%=\n\t" + "BNE L_sp_3072_sub_96_word\n\t" #else - "BNE.N L_sp_3072_sub_96_word_%=\n\t" + "BNE.N L_sp_3072_sub_96_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -19014,9 +19043,9 @@ static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -19079,9 +19108,9 @@ static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -19105,7 +19134,7 @@ static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_3072_word_96_bit_%=:\n\t" + "L_div_3072_word_96_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -19115,7 +19144,7 @@ static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_3072_word_96_bit_%=\n\t" + "bpl L_div_3072_word_96_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -19270,7 +19299,7 @@ static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x17c\n\t" "\n" - "L_sp_3072_cmp_96_words_%=:\n\t" + "L_sp_3072_cmp_96_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -19283,7 +19312,7 @@ static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_3072_cmp_96_words_%=\n\t" + "bcs L_sp_3072_cmp_96_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #380]\n\t" @@ -20898,7 +20927,7 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_dig "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_cond_add_48_words_%=:\n\t" + "L_sp_3072_cond_add_48_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -20909,9 +20938,9 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const 
sp_dig "ADD r4, r4, #0x4\n\t" "CMP r4, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_cond_add_48_words_%=\n\t" + "BLT L_sp_3072_cond_add_48_words\n\t" #else - "BLT.N L_sp_3072_cond_add_48_words_%=\n\t" + "BLT.N L_sp_3072_cond_add_48_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -23042,7 +23071,7 @@ static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, const sp_digit* "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x200\n\t" "\n" - "L_sp_4096_add_128_word_%=:\n\t" + "L_sp_4096_add_128_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -23055,9 +23084,9 @@ static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, const sp_digit* "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_4096_add_128_word_%=\n\t" + "BNE L_sp_4096_add_128_word\n\t" #else - "BNE.N L_sp_4096_add_128_word_%=\n\t" + "BNE.N L_sp_4096_add_128_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -23089,7 +23118,7 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x200\n\t" "\n" - "L_sp_4096_sub_in_pkace_128_word_%=:\n\t" + "L_sp_4096_sub_in_pkace_128_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -23101,9 +23130,9 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_4096_sub_in_pkace_128_word_%=\n\t" + "BNE L_sp_4096_sub_in_pkace_128_word\n\t" #else - "BNE.N L_sp_4096_sub_in_pkace_128_word_%=\n\t" + "BNE.N L_sp_4096_sub_in_pkace_128_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -23135,61 +23164,80 @@ static void 
sp_4096_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x400\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_4096_mul_128_outer_%=:\n\t" + "L_sp_4096_mul_128_outer:\n\t" "SUBS r3, r5, #0x1fc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_4096_mul_128_inner_%=:\n\t" + "L_sp_4096_mul_128_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x200\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_mul_128_inner_done_%=\n\t" + "BGT L_sp_4096_mul_128_inner_done\n\t" #else - "BEQ.N L_sp_4096_mul_128_inner_done_%=\n\t" + "BGT.N L_sp_4096_mul_128_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_mul_128_inner_%=\n\t" + "BLT L_sp_4096_mul_128_inner\n\t" #else - "BLE.N L_sp_4096_mul_128_inner_%=\n\t" + "BLT.N L_sp_4096_mul_128_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_4096_mul_128_inner_done_%=:\n\t" + "L_sp_4096_mul_128_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x3f8\n\t" + "CMP r5, #0x3f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_mul_128_outer_%=\n\t" + "BLE 
L_sp_4096_mul_128_outer\n\t" #else - "BLE.N L_sp_4096_mul_128_outer_%=\n\t" + "BLE.N L_sp_4096_mul_128_outer\n\t" #endif + "LDR lr, [%[a], #508]\n\t" + "LDR r11, [%[b], #508]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_4096_mul_128_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_4096_mul_128_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_4096_mul_128_store_%=\n\t" + "BGT L_sp_4096_mul_128_store\n\t" #else - "BGT.N L_sp_4096_mul_128_store_%=\n\t" + "BGT.N L_sp_4096_mul_128_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -23215,24 +23263,20 @@ static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x400\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_4096_sqr_128_outer_%=:\n\t" + "L_sp_4096_sqr_128_outer:\n\t" "SUBS r3, r5, #0x1fc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_4096_sqr_128_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_sqr_128_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_4096_sqr_128_op_sqr_%=\n\t" -#endif + "L_sp_4096_sqr_128_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -23242,59 +23286,51 @@ static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_4096_sqr_128_op_done_%=\n\t" - "\n" - "L_sp_4096_sqr_128_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, 
lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_4096_sqr_128_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x200\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_sqr_128_inner_done_%=\n\t" -#else - "BEQ.N L_sp_4096_sqr_128_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_4096_sqr_128_inner_done_%=\n\t" + "BGT L_sp_4096_sqr_128_inner_done\n\t" #else - "BGT.N L_sp_4096_sqr_128_inner_done_%=\n\t" + "BGT.N L_sp_4096_sqr_128_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_sqr_128_inner_%=\n\t" + "BLT L_sp_4096_sqr_128_inner\n\t" #else - "BLE.N L_sp_4096_sqr_128_inner_%=\n\t" + "BLT.N L_sp_4096_sqr_128_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_4096_sqr_128_inner_done_%=:\n\t" + "L_sp_4096_sqr_128_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x3f8\n\t" + "CMP r5, #0x3f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_sqr_128_outer_%=\n\t" + "BLE L_sp_4096_sqr_128_outer\n\t" #else - "BLE.N L_sp_4096_sqr_128_outer_%=\n\t" + "BLE.N L_sp_4096_sqr_128_outer\n\t" #endif + "LDR lr, [%[a], #508]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_4096_sqr_128_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_4096_sqr_128_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || 
defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_4096_sqr_128_store_%=\n\t" + "BGT L_sp_4096_sqr_128_store\n\t" #else - "BGT.N L_sp_4096_sqr_128_store_%=\n\t" + "BGT.N L_sp_4096_sqr_128_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -23351,7 +23387,7 @@ static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_4096_mul_d_128_word_%=:\n\t" + "L_sp_4096_mul_d_128_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -23365,9 +23401,9 @@ static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mul_d_128_word_%=\n\t" + "BLT L_sp_4096_mul_d_128_word\n\t" #else - "BLT.N L_sp_4096_mul_d_128_word_%=\n\t" + "BLT.N L_sp_4096_mul_d_128_word\n\t" #endif "STR r3, [%[r], #512]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -24086,7 +24122,7 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a, const sp_di "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_4096_cond_sub_128_words_%=:\n\t" + "L_sp_4096_cond_sub_128_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -24097,9 +24133,9 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a, const sp_di "ADD r5, r5, #0x4\n\t" "CMP r5, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_cond_sub_128_words_%=\n\t" + "BLT L_sp_4096_cond_sub_128_words\n\t" #else - "BLT.N L_sp_4096_cond_sub_128_words_%=\n\t" + "BLT.N L_sp_4096_cond_sub_128_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -24618,7 +24654,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_4096_mont_reduce_128_word_%=:\n\t" + 
"L_sp_4096_mont_reduce_128_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -25649,9 +25685,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x200\n\t" #ifdef __GNUC__ - "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_word\n\t" #else - "BLT.W L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT.W L_sp_4096_mont_reduce_128_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -25690,7 +25726,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_4096_mont_reduce_128_word_%=:\n\t" + "L_sp_4096_mont_reduce_128_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -25698,7 +25734,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_4096_mont_reduce_128_mul_%=:\n\t" + "L_sp_4096_mont_reduce_128_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -25741,9 +25777,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mont_reduce_128_mul_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_mul\n\t" #else - "BLT.N L_sp_4096_mont_reduce_128_mul_%=\n\t" + "BLT.N L_sp_4096_mont_reduce_128_mul\n\t" #endif "LDR r10, [%[a], #512]\n\t" "ADDS r4, r4, r3\n\t" @@ -25757,9 +25793,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_word\n\t" #else - "BLT.N L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT.N 
L_sp_4096_mont_reduce_128_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -25801,7 +25837,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_4096_mont_reduce_128_word_%=:\n\t" + "L_sp_4096_mont_reduce_128_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -26449,9 +26485,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x200\n\t" #ifdef __GNUC__ - "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_word\n\t" #else - "BLT.W L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT.W L_sp_4096_mont_reduce_128_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -26493,7 +26529,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_4096_mont_reduce_128_word_%=:\n\t" + "L_sp_4096_mont_reduce_128_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -26501,7 +26537,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_4096_mont_reduce_128_mul_%=:\n\t" + "L_sp_4096_mont_reduce_128_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -26532,9 +26568,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mont_reduce_128_mul_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_mul\n\t" #else - "BLT.N L_sp_4096_mont_reduce_128_mul_%=\n\t" + "BLT.N L_sp_4096_mont_reduce_128_mul\n\t" #endif "LDR r10, [%[a], #512]\n\t" "ADDS r4, r4, r3\n\t" @@ -26548,9 +26584,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], 
#0x4\n\t" "CMP r9, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_word\n\t" #else - "BLT.N L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT.N L_sp_4096_mont_reduce_128_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -26616,7 +26652,7 @@ static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x200\n\t" "\n" - "L_sp_4096_sub_128_word_%=:\n\t" + "L_sp_4096_sub_128_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -26628,9 +26664,9 @@ static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_4096_sub_128_word_%=\n\t" + "BNE L_sp_4096_sub_128_word\n\t" #else - "BNE.N L_sp_4096_sub_128_word_%=\n\t" + "BNE.N L_sp_4096_sub_128_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -26904,9 +26940,9 @@ static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -26969,9 +27005,9 @@ static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -26995,7 +27031,7 @@ static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_4096_word_128_bit_%=:\n\t" + "L_div_4096_word_128_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -27005,7 +27041,7 @@ static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_4096_word_128_bit_%=\n\t" + "bpl L_div_4096_word_128_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -27160,7 +27196,7 @@ static sp_int32 sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x1fc\n\t" "\n" - "L_sp_4096_cmp_128_words_%=:\n\t" + "L_sp_4096_cmp_128_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -27173,7 +27209,7 @@ static sp_int32 sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_4096_cmp_128_words_%=\n\t" + "bcs L_sp_4096_cmp_128_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #508]\n\t" @@ -29140,7 +29176,7 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_dig "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_4096_cond_add_64_words_%=:\n\t" + "L_sp_4096_cond_add_64_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -29151,9 +29187,9 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r, const 
sp_digit* a, const sp_dig "ADD r4, r4, #0x4\n\t" "CMP r4, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_cond_add_64_words_%=\n\t" + "BLT L_sp_4096_cond_add_64_words\n\t" #else - "BLT.N L_sp_4096_cond_add_64_words_%=\n\t" + "BLT.N L_sp_4096_cond_add_64_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -30825,61 +30861,80 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x40\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_256_mul_8_outer_%=:\n\t" + "L_sp_256_mul_8_outer:\n\t" "SUBS r3, r5, #0x1c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_256_mul_8_inner_%=:\n\t" + "L_sp_256_mul_8_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x20\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_mul_8_inner_done_%=\n\t" + "BGT L_sp_256_mul_8_inner_done\n\t" #else - "BEQ.N L_sp_256_mul_8_inner_done_%=\n\t" + "BGT.N L_sp_256_mul_8_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_mul_8_inner_%=\n\t" + "BLT L_sp_256_mul_8_inner\n\t" #else - "BLE.N L_sp_256_mul_8_inner_%=\n\t" + "BLT.N L_sp_256_mul_8_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" 
+ "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_256_mul_8_inner_done_%=:\n\t" + "L_sp_256_mul_8_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x38\n\t" + "CMP r5, #0x34\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_mul_8_outer_%=\n\t" + "BLE L_sp_256_mul_8_outer\n\t" #else - "BLE.N L_sp_256_mul_8_outer_%=\n\t" + "BLE.N L_sp_256_mul_8_outer\n\t" #endif + "LDR lr, [%[a], #28]\n\t" + "LDR r11, [%[b], #28]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_256_mul_8_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_256_mul_8_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_256_mul_8_store_%=\n\t" + "BGT L_sp_256_mul_8_store\n\t" #else - "BGT.N L_sp_256_mul_8_store_%=\n\t" + "BGT.N L_sp_256_mul_8_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -31411,24 +31466,20 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x40\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_256_sqr_8_outer_%=:\n\t" + "L_sp_256_sqr_8_outer:\n\t" "SUBS r3, r5, #0x1c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_256_sqr_8_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_sqr_8_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_256_sqr_8_op_sqr_%=\n\t" -#endif + "L_sp_256_sqr_8_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR 
r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -31438,59 +31489,51 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_256_sqr_8_op_done_%=\n\t" - "\n" - "L_sp_256_sqr_8_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_256_sqr_8_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x20\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_sqr_8_inner_done_%=\n\t" -#else - "BEQ.N L_sp_256_sqr_8_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_256_sqr_8_inner_done_%=\n\t" + "BGT L_sp_256_sqr_8_inner_done\n\t" #else - "BGT.N L_sp_256_sqr_8_inner_done_%=\n\t" + "BGT.N L_sp_256_sqr_8_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_sqr_8_inner_%=\n\t" + "BLT L_sp_256_sqr_8_inner\n\t" #else - "BLE.N L_sp_256_sqr_8_inner_%=\n\t" + "BLT.N L_sp_256_sqr_8_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_256_sqr_8_inner_done_%=:\n\t" + "L_sp_256_sqr_8_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x38\n\t" + "CMP r5, #0x34\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_sqr_8_outer_%=\n\t" + "BLE L_sp_256_sqr_8_outer\n\t" #else - "BLE.N L_sp_256_sqr_8_outer_%=\n\t" + "BLE.N L_sp_256_sqr_8_outer\n\t" #endif + "LDR lr, [%[a], #28]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_256_sqr_8_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, 
r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_256_sqr_8_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_256_sqr_8_store_%=\n\t" + "BGT L_sp_256_sqr_8_store\n\t" #else - "BGT.N L_sp_256_sqr_8_store_%=\n\t" + "BGT.N L_sp_256_sqr_8_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -31896,7 +31939,7 @@ static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x20\n\t" "\n" - "L_sp_256_add_8_word_%=:\n\t" + "L_sp_256_add_8_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -31909,9 +31952,9 @@ static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_256_add_8_word_%=\n\t" + "BNE L_sp_256_add_8_word\n\t" #else - "BNE.N L_sp_256_add_8_word_%=\n\t" + "BNE.N L_sp_256_add_8_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -33818,8 +33861,8 @@ static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery multiplier. 
*/ -static void sp_256_mont_sqr_n_8(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_n_8(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_256_mont_sqr_8(r, a, m, mp); for (; n > 1; n--) { @@ -33931,7 +33974,7 @@ static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x1c\n\t" "\n" - "L_sp_256_cmp_8_words_%=:\n\t" + "L_sp_256_cmp_8_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -33944,7 +33987,7 @@ static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_256_cmp_8_words_%=\n\t" + "bcs L_sp_256_cmp_8_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #28]\n\t" @@ -34078,7 +34121,7 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a, const sp_digit "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_256_cond_sub_8_words_%=:\n\t" + "L_sp_256_cond_sub_8_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -34089,9 +34132,9 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a, const sp_digit "ADD r5, r5, #0x4\n\t" "CMP r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_256_cond_sub_8_words_%=\n\t" + "BLT L_sp_256_cond_sub_8_words\n\t" #else - "BLT.N L_sp_256_cond_sub_8_words_%=\n\t" + "BLT.N L_sp_256_cond_sub_8_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -34192,7 +34235,7 @@ static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_256_mont_reduce_8_word_%=:\n\t" + "L_sp_256_mont_reduce_8_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -34263,9 +34306,9 @@ static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, sp_digit mp) 
"ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x20\n\t" #ifdef __GNUC__ - "BLT L_sp_256_mont_reduce_8_word_%=\n\t" + "BLT L_sp_256_mont_reduce_8_word\n\t" #else - "BLT.W L_sp_256_mont_reduce_8_word_%=\n\t" + "BLT.W L_sp_256_mont_reduce_8_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -34307,7 +34350,7 @@ static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_256_mont_reduce_8_word_%=:\n\t" + "L_sp_256_mont_reduce_8_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -34355,9 +34398,9 @@ static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x20\n\t" #ifdef __GNUC__ - "BLT L_sp_256_mont_reduce_8_word_%=\n\t" + "BLT L_sp_256_mont_reduce_8_word\n\t" #else - "BLT.W L_sp_256_mont_reduce_8_word_%=\n\t" + "BLT.W L_sp_256_mont_reduce_8_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -34566,7 +34609,7 @@ static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, sp_digit "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_256_mont_reduce_order_8_word_%=:\n\t" + "L_sp_256_mont_reduce_order_8_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -34637,9 +34680,9 @@ static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, sp_digit "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x20\n\t" #ifdef __GNUC__ - "BLT L_sp_256_mont_reduce_order_8_word_%=\n\t" + "BLT L_sp_256_mont_reduce_order_8_word\n\t" #else - "BLT.W L_sp_256_mont_reduce_order_8_word_%=\n\t" + "BLT.W L_sp_256_mont_reduce_order_8_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -34681,7 +34724,7 @@ static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, sp_digit "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_256_mont_reduce_order_8_word_%=:\n\t" + "L_sp_256_mont_reduce_order_8_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], 
r6\n\t" /* a[i+0] += m[0] * mu */ @@ -34729,9 +34772,9 @@ static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, sp_digit "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x20\n\t" #ifdef __GNUC__ - "BLT L_sp_256_mont_reduce_order_8_word_%=\n\t" + "BLT L_sp_256_mont_reduce_order_8_word\n\t" #else - "BLT.W L_sp_256_mont_reduce_order_8_word_%=\n\t" + "BLT.W L_sp_256_mont_reduce_order_8_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -39060,7 +39103,7 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x20\n\t" "\n" - "L_sp_256_sub_in_pkace_8_word_%=:\n\t" + "L_sp_256_sub_in_pkace_8_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -39072,9 +39115,9 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_256_sub_in_pkace_8_word_%=\n\t" + "BNE L_sp_256_sub_in_pkace_8_word\n\t" #else - "BNE.N L_sp_256_sub_in_pkace_8_word_%=\n\t" + "BNE.N L_sp_256_sub_in_pkace_8_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -39153,7 +39196,7 @@ static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_256_mul_d_8_word_%=:\n\t" + "L_sp_256_mul_d_8_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -39167,9 +39210,9 @@ static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_256_mul_d_8_word_%=\n\t" + "BLT L_sp_256_mul_d_8_word\n\t" #else - "BLT.N L_sp_256_mul_d_8_word_%=\n\t" + "BLT.N L_sp_256_mul_d_8_word\n\t" #endif "STR r3, [%[r], #32]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -39256,9 +39299,9 @@ static void sp_256_mul_d_8(sp_digit* r, const 
sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -39321,9 +39364,9 @@ static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -39347,7 +39390,7 @@ static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_256_word_8_bit_%=:\n\t" + "L_div_256_word_8_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -39357,7 +39400,7 @@ static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_256_word_8_bit_%=\n\t" + "bpl L_div_256_word_8_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -40051,7 +40094,7 @@ static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x20\n\t" "\n" - "L_sp_256_sub_8_word_%=:\n\t" + "L_sp_256_sub_8_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" 
"LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -40063,9 +40106,9 @@ static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_256_sub_8_word_%=\n\t" + "BNE L_sp_256_sub_8_word\n\t" #else - "BNE.N L_sp_256_sub_8_word_%=\n\t" + "BNE.N L_sp_256_sub_8_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -40185,9 +40228,9 @@ static void sp_256_div2_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m) "LDM %[a]!, {r4}\n\t" "ANDS r3, r4, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_div2_mod_8_even_%=\n\t" + "BEQ L_sp_256_div2_mod_8_even\n\t" #else - "BEQ.N L_sp_256_div2_mod_8_even_%=\n\t" + "BEQ.N L_sp_256_div2_mod_8_even\n\t" #endif "LDM %[a]!, {r5, r6, r7}\n\t" "LDM %[m]!, {r8, r9, r10, r11}\n\t" @@ -40203,13 +40246,17 @@ static void sp_256_div2_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m) "ADCS r6, r6, r10\n\t" "ADCS r7, r7, r11\n\t" "ADC r3, r12, r12\n\t" - "B L_sp_256_div2_mod_8_div2_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_div2_mod_8_div2\n\t" +#else + "B.N L_sp_256_div2_mod_8_div2\n\t" +#endif "\n" - "L_sp_256_div2_mod_8_even_%=:\n\t" + "L_sp_256_div2_mod_8_even:\n\t" "LDRD r4, r5, [%[a], #12]\n\t" "LDRD r6, r7, [%[a], #20]\n\t" "\n" - "L_sp_256_div2_mod_8_div2_%=:\n\t" + "L_sp_256_div2_mod_8_div2:\n\t" "LSR r8, r4, #1\n\t" "AND r4, r4, #0x1\n\t" "LSR r9, r5, #1\n\t" @@ -40252,100 +40299,128 @@ static int sp_256_num_bits_8(const sp_digit* a) "LDR r1, [%[a], #28]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_7_%=\n\t" + "BEQ L_sp_256_num_bits_8_7\n\t" #else - "BEQ.N L_sp_256_num_bits_8_7_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_7\n\t" #endif "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, 
r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_7_%=:\n\t" + "L_sp_256_num_bits_8_7:\n\t" "LDR r1, [%[a], #24]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_6_%=\n\t" + "BEQ L_sp_256_num_bits_8_6\n\t" #else - "BEQ.N L_sp_256_num_bits_8_6_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_6\n\t" #endif "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_6_%=:\n\t" + "L_sp_256_num_bits_8_6:\n\t" "LDR r1, [%[a], #20]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_5_%=\n\t" + "BEQ L_sp_256_num_bits_8_5\n\t" #else - "BEQ.N L_sp_256_num_bits_8_5_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_5\n\t" #endif "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_5_%=:\n\t" + "L_sp_256_num_bits_8_5:\n\t" "LDR r1, [%[a], #16]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_4_%=\n\t" + "BEQ L_sp_256_num_bits_8_4\n\t" #else - "BEQ.N L_sp_256_num_bits_8_4_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_4\n\t" #endif "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N 
L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_4_%=:\n\t" + "L_sp_256_num_bits_8_4:\n\t" "LDR r1, [%[a], #12]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_3_%=\n\t" + "BEQ L_sp_256_num_bits_8_3\n\t" #else - "BEQ.N L_sp_256_num_bits_8_3_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_3\n\t" #endif "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_3_%=:\n\t" + "L_sp_256_num_bits_8_3:\n\t" "LDR r1, [%[a], #8]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_2_%=\n\t" + "BEQ L_sp_256_num_bits_8_2\n\t" #else - "BEQ.N L_sp_256_num_bits_8_2_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_2\n\t" #endif "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_2_%=:\n\t" + "L_sp_256_num_bits_8_2:\n\t" "LDR r1, [%[a], #4]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_1_%=\n\t" + "BEQ L_sp_256_num_bits_8_1\n\t" #else - "BEQ.N L_sp_256_num_bits_8_1_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_1\n\t" #endif "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_1_%=:\n\t" + "L_sp_256_num_bits_8_1:\n\t" "LDR r1, [%[a]]\n\t" "MOV r2, #0x20\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" "\n" - 
"L_sp_256_num_bits_8_9_%=:\n\t" + "L_sp_256_num_bits_8_9:\n\t" "MOV %[a], r4\n\t" : [a] "+r" (a) : @@ -40798,7 +40873,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -40854,7 +40929,7 @@ static int sp_256_ecc_is_point_8(const sp_point_256* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -41460,61 +41535,80 @@ static void sp_384_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x60\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_384_mul_12_outer_%=:\n\t" + "L_sp_384_mul_12_outer:\n\t" "SUBS r3, r5, #0x2c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_384_mul_12_inner_%=:\n\t" + "L_sp_384_mul_12_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x30\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_mul_12_inner_done_%=\n\t" + "BGT L_sp_384_mul_12_inner_done\n\t" #else - "BEQ.N L_sp_384_mul_12_inner_done_%=\n\t" + "BGT.N L_sp_384_mul_12_inner_done\n\t" #endif - "CMP r3, 
r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_mul_12_inner_%=\n\t" + "BLT L_sp_384_mul_12_inner\n\t" #else - "BLE.N L_sp_384_mul_12_inner_%=\n\t" + "BLT.N L_sp_384_mul_12_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_384_mul_12_inner_done_%=:\n\t" + "L_sp_384_mul_12_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x58\n\t" + "CMP r5, #0x54\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_mul_12_outer_%=\n\t" + "BLE L_sp_384_mul_12_outer\n\t" #else - "BLE.N L_sp_384_mul_12_outer_%=\n\t" + "BLE.N L_sp_384_mul_12_outer\n\t" #endif + "LDR lr, [%[a], #44]\n\t" + "LDR r11, [%[b], #44]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_384_mul_12_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_384_mul_12_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_384_mul_12_store_%=\n\t" + "BGT L_sp_384_mul_12_store\n\t" #else - "BGT.N L_sp_384_mul_12_store_%=\n\t" + "BGT.N L_sp_384_mul_12_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -42570,24 +42664,20 @@ static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x60\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_384_sqr_12_outer_%=:\n\t" + "L_sp_384_sqr_12_outer:\n\t" "SUBS r3, r5, 
#0x2c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_384_sqr_12_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_sqr_12_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_384_sqr_12_op_sqr_%=\n\t" -#endif + "L_sp_384_sqr_12_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -42597,59 +42687,51 @@ static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_384_sqr_12_op_done_%=\n\t" - "\n" - "L_sp_384_sqr_12_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_384_sqr_12_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x30\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_sqr_12_inner_done_%=\n\t" -#else - "BEQ.N L_sp_384_sqr_12_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_384_sqr_12_inner_done_%=\n\t" + "BGT L_sp_384_sqr_12_inner_done\n\t" #else - "BGT.N L_sp_384_sqr_12_inner_done_%=\n\t" + "BGT.N L_sp_384_sqr_12_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_sqr_12_inner_%=\n\t" + "BLT L_sp_384_sqr_12_inner\n\t" #else - "BLE.N L_sp_384_sqr_12_inner_%=\n\t" + "BLT.N L_sp_384_sqr_12_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_384_sqr_12_inner_done_%=:\n\t" + "L_sp_384_sqr_12_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x58\n\t" + "CMP r5, #0x54\n\t" #if defined(__GNUC__) || 
defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_sqr_12_outer_%=\n\t" + "BLE L_sp_384_sqr_12_outer\n\t" #else - "BLE.N L_sp_384_sqr_12_outer_%=\n\t" + "BLE.N L_sp_384_sqr_12_outer\n\t" #endif + "LDR lr, [%[a], #44]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_384_sqr_12_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_384_sqr_12_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_384_sqr_12_store_%=\n\t" + "BGT L_sp_384_sqr_12_store\n\t" #else - "BGT.N L_sp_384_sqr_12_store_%=\n\t" + "BGT.N L_sp_384_sqr_12_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -43382,7 +43464,7 @@ static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x30\n\t" "\n" - "L_sp_384_add_12_word_%=:\n\t" + "L_sp_384_add_12_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -43395,9 +43477,9 @@ static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_384_add_12_word_%=\n\t" + "BNE L_sp_384_add_12_word\n\t" #else - "BNE.N L_sp_384_add_12_word_%=\n\t" + "BNE.N L_sp_384_add_12_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -43782,7 +43864,7 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r, const sp_digit* a, const sp_digi "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_384_cond_sub_12_words_%=:\n\t" + "L_sp_384_cond_sub_12_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -43793,9 +43875,9 @@ static 
sp_digit sp_384_cond_sub_12(sp_digit* r, const sp_digit* a, const sp_digi "ADD r5, r5, #0x4\n\t" "CMP r5, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_384_cond_sub_12_words_%=\n\t" + "BLT L_sp_384_cond_sub_12_words\n\t" #else - "BLT.N L_sp_384_cond_sub_12_words_%=\n\t" + "BLT.N L_sp_384_cond_sub_12_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -43909,7 +43991,7 @@ static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_384_mont_reduce_12_word_%=:\n\t" + "L_sp_384_mont_reduce_12_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -44012,9 +44094,9 @@ static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x30\n\t" #ifdef __GNUC__ - "BLT L_sp_384_mont_reduce_12_word_%=\n\t" + "BLT L_sp_384_mont_reduce_12_word\n\t" #else - "BLT.W L_sp_384_mont_reduce_12_word_%=\n\t" + "BLT.W L_sp_384_mont_reduce_12_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -44056,7 +44138,7 @@ static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_384_mont_reduce_12_word_%=:\n\t" + "L_sp_384_mont_reduce_12_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -44124,9 +44206,9 @@ static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x30\n\t" #ifdef __GNUC__ - "BLT L_sp_384_mont_reduce_12_word_%=\n\t" + "BLT L_sp_384_mont_reduce_12_word\n\t" #else - "BLT.W L_sp_384_mont_reduce_12_word_%=\n\t" + "BLT.W L_sp_384_mont_reduce_12_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -44182,8 +44264,8 @@ SP_NOINLINE static void sp_384_mont_sqr_12(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. 
*/ -static void sp_384_mont_sqr_n_12(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_384_mont_sqr_n_12(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_384_mont_sqr_12(r, a, m, mp); for (; n > 1; n--) { @@ -44311,7 +44393,7 @@ static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x2c\n\t" "\n" - "L_sp_384_cmp_12_words_%=:\n\t" + "L_sp_384_cmp_12_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -44324,7 +44406,7 @@ static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_384_cmp_12_words_%=\n\t" + "bcs L_sp_384_cmp_12_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #44]\n\t" @@ -44614,7 +44696,7 @@ static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x30\n\t" "\n" - "L_sp_384_sub_12_word_%=:\n\t" + "L_sp_384_sub_12_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -44626,9 +44708,9 @@ static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_384_sub_12_word_%=\n\t" + "BNE L_sp_384_sub_12_word\n\t" #else - "BNE.N L_sp_384_sub_12_word_%=\n\t" + "BNE.N L_sp_384_sub_12_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -44715,7 +44797,7 @@ static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digi "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_384_cond_add_12_words_%=:\n\t" + "L_sp_384_cond_add_12_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -44726,9 +44808,9 @@ static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digi 
"ADD r4, r4, #0x4\n\t" "CMP r4, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_384_cond_add_12_words_%=\n\t" + "BLT L_sp_384_cond_add_12_words\n\t" #else - "BLT.N L_sp_384_cond_add_12_words_%=\n\t" + "BLT.N L_sp_384_cond_add_12_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -48912,7 +48994,7 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x30\n\t" "\n" - "L_sp_384_sub_in_pkace_12_word_%=:\n\t" + "L_sp_384_sub_in_pkace_12_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -48924,9 +49006,9 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_384_sub_in_pkace_12_word_%=\n\t" + "BNE L_sp_384_sub_in_pkace_12_word\n\t" #else - "BNE.N L_sp_384_sub_in_pkace_12_word_%=\n\t" + "BNE.N L_sp_384_sub_in_pkace_12_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -49012,7 +49094,7 @@ static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_384_mul_d_12_word_%=:\n\t" + "L_sp_384_mul_d_12_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -49026,9 +49108,9 @@ static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_384_mul_d_12_word_%=\n\t" + "BLT L_sp_384_mul_d_12_word\n\t" #else - "BLT.N L_sp_384_mul_d_12_word_%=\n\t" + "BLT.N L_sp_384_mul_d_12_word\n\t" #endif "STR r3, [%[r], #48]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -49135,9 +49217,9 @@ static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an 
approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -49200,9 +49282,9 @@ static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -49226,7 +49308,7 @@ static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_384_word_12_bit_%=:\n\t" + "L_div_384_word_12_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -49236,7 +49318,7 @@ static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_384_word_12_bit_%=\n\t" + "bpl L_div_384_word_12_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -49900,9 +49982,9 @@ static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m "LDM %[a]!, {r4}\n\t" "ANDS r3, r4, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_div2_mod_12_even_%=\n\t" + "BEQ L_sp_384_div2_mod_12_even\n\t" 
#else - "BEQ.N L_sp_384_div2_mod_12_even_%=\n\t" + "BEQ.N L_sp_384_div2_mod_12_even\n\t" #endif "MOV r12, #0x0\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" @@ -49927,9 +50009,13 @@ static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m "ADCS r7, r7, r11\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "ADC r3, r12, r12\n\t" - "B L_sp_384_div2_mod_12_div2_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_div2_mod_12_div2\n\t" +#else + "B.N L_sp_384_div2_mod_12_div2\n\t" +#endif "\n" - "L_sp_384_div2_mod_12_even_%=:\n\t" + "L_sp_384_div2_mod_12_even:\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" @@ -49937,7 +50023,7 @@ static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m "LDM %[a]!, {r4, r5, r6, r7}\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "\n" - "L_sp_384_div2_mod_12_div2_%=:\n\t" + "L_sp_384_div2_mod_12_div2:\n\t" "SUB %[r], %[r], #0x30\n\t" "LDRD r8, r9, [%[r]]\n\t" "LSR r8, r8, #1\n\t" @@ -50006,152 +50092,196 @@ static int sp_384_num_bits_12(const sp_digit* a) "LDR r1, [%[a], #44]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_11_%=\n\t" + "BEQ L_sp_384_num_bits_12_11\n\t" #else - "BEQ.N L_sp_384_num_bits_12_11_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_11\n\t" #endif "MOV r2, #0x180\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_11_%=:\n\t" + "L_sp_384_num_bits_12_11:\n\t" "LDR r1, [%[a], #40]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_10_%=\n\t" + "BEQ L_sp_384_num_bits_12_10\n\t" #else - "BEQ.N L_sp_384_num_bits_12_10_%=\n\t" + "BEQ.N 
L_sp_384_num_bits_12_10\n\t" #endif "MOV r2, #0x160\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_10_%=:\n\t" + "L_sp_384_num_bits_12_10:\n\t" "LDR r1, [%[a], #36]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_9_%=\n\t" + "BEQ L_sp_384_num_bits_12_9\n\t" #else - "BEQ.N L_sp_384_num_bits_12_9_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_9\n\t" #endif "MOV r2, #0x140\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_9_%=:\n\t" + "L_sp_384_num_bits_12_9:\n\t" "LDR r1, [%[a], #32]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_8_%=\n\t" + "BEQ L_sp_384_num_bits_12_8\n\t" #else - "BEQ.N L_sp_384_num_bits_12_8_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_8\n\t" #endif "MOV r2, #0x120\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_8_%=:\n\t" + "L_sp_384_num_bits_12_8:\n\t" "LDR r1, [%[a], #28]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_7_%=\n\t" + "BEQ L_sp_384_num_bits_12_7\n\t" #else - "BEQ.N L_sp_384_num_bits_12_7_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_7\n\t" #endif "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if 
defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_7_%=:\n\t" + "L_sp_384_num_bits_12_7:\n\t" "LDR r1, [%[a], #24]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_6_%=\n\t" + "BEQ L_sp_384_num_bits_12_6\n\t" #else - "BEQ.N L_sp_384_num_bits_12_6_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_6\n\t" #endif "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_6_%=:\n\t" + "L_sp_384_num_bits_12_6:\n\t" "LDR r1, [%[a], #20]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_5_%=\n\t" + "BEQ L_sp_384_num_bits_12_5\n\t" #else - "BEQ.N L_sp_384_num_bits_12_5_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_5\n\t" #endif "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_5_%=:\n\t" + "L_sp_384_num_bits_12_5:\n\t" "LDR r1, [%[a], #16]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_4_%=\n\t" + "BEQ L_sp_384_num_bits_12_4\n\t" #else - "BEQ.N L_sp_384_num_bits_12_4_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_4\n\t" #endif "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" 
+#endif "\n" - "L_sp_384_num_bits_12_4_%=:\n\t" + "L_sp_384_num_bits_12_4:\n\t" "LDR r1, [%[a], #12]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_3_%=\n\t" + "BEQ L_sp_384_num_bits_12_3\n\t" #else - "BEQ.N L_sp_384_num_bits_12_3_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_3\n\t" #endif "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_3_%=:\n\t" + "L_sp_384_num_bits_12_3:\n\t" "LDR r1, [%[a], #8]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_2_%=\n\t" + "BEQ L_sp_384_num_bits_12_2\n\t" #else - "BEQ.N L_sp_384_num_bits_12_2_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_2\n\t" #endif "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_2_%=:\n\t" + "L_sp_384_num_bits_12_2:\n\t" "LDR r1, [%[a], #4]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_1_%=\n\t" + "BEQ L_sp_384_num_bits_12_1\n\t" #else - "BEQ.N L_sp_384_num_bits_12_1_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_1\n\t" #endif "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_1_%=:\n\t" + "L_sp_384_num_bits_12_1:\n\t" "LDR r1, [%[a]]\n\t" "MOV r2, #0x20\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, 
r4\n\t" "\n" - "L_sp_384_num_bits_12_13_%=:\n\t" + "L_sp_384_num_bits_12_13:\n\t" "MOV %[a], r4\n\t" : [a] "+r" (a) : @@ -50608,7 +50738,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -50664,7 +50794,7 @@ static int sp_384_ecc_is_point_12(const sp_point_384* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -51312,64 +51442,83 @@ static void sp_521_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x88\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_521_mul_17_outer_%=:\n\t" + "L_sp_521_mul_17_outer:\n\t" "SUBS r3, r5, #0x40\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_521_mul_17_inner_%=:\n\t" + "L_sp_521_mul_17_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x44\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_mul_17_inner_done_%=\n\t" + "BGT L_sp_521_mul_17_inner_done\n\t" #else - "BEQ.N L_sp_521_mul_17_inner_done_%=\n\t" + "BGT.N L_sp_521_mul_17_inner_done\n\t" 
#endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_mul_17_inner_%=\n\t" + "BLT L_sp_521_mul_17_inner\n\t" #else - "BLE.N L_sp_521_mul_17_inner_%=\n\t" + "BLT.N L_sp_521_mul_17_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_521_mul_17_inner_done_%=:\n\t" + "L_sp_521_mul_17_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x80\n\t" + "CMP r5, #0x7c\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_mul_17_outer_%=\n\t" + "BLE L_sp_521_mul_17_outer\n\t" #else - "BLE.N L_sp_521_mul_17_outer_%=\n\t" + "BLE.N L_sp_521_mul_17_outer\n\t" #endif + "LDR lr, [%[a], #64]\n\t" + "LDR r11, [%[b], #64]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "LDM sp!, {r6, r7}\n\t" "STM %[r]!, {r6, r7}\n\t" "SUB r5, r5, #0x8\n\t" "\n" - "L_sp_521_mul_17_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_521_mul_17_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_521_mul_17_store_%=\n\t" + "BGT L_sp_521_mul_17_store\n\t" #else - "BGT.N L_sp_521_mul_17_store_%=\n\t" + "BGT.N L_sp_521_mul_17_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -53439,24 +53588,20 @@ static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x88\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" 
"\n" - "L_sp_521_sqr_17_outer_%=:\n\t" + "L_sp_521_sqr_17_outer:\n\t" "SUBS r3, r5, #0x40\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_521_sqr_17_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_sqr_17_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_521_sqr_17_op_sqr_%=\n\t" -#endif + "L_sp_521_sqr_17_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -53466,62 +53611,54 @@ static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_521_sqr_17_op_done_%=\n\t" - "\n" - "L_sp_521_sqr_17_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_521_sqr_17_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x44\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_sqr_17_inner_done_%=\n\t" -#else - "BEQ.N L_sp_521_sqr_17_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_521_sqr_17_inner_done_%=\n\t" + "BGT L_sp_521_sqr_17_inner_done\n\t" #else - "BGT.N L_sp_521_sqr_17_inner_done_%=\n\t" + "BGT.N L_sp_521_sqr_17_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_sqr_17_inner_%=\n\t" + "BLT L_sp_521_sqr_17_inner\n\t" #else - "BLE.N L_sp_521_sqr_17_inner_%=\n\t" + "BLT.N L_sp_521_sqr_17_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_521_sqr_17_inner_done_%=:\n\t" + "L_sp_521_sqr_17_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, 
#0x4\n\t" - "CMP r5, #0x80\n\t" + "CMP r5, #0x7c\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_sqr_17_outer_%=\n\t" + "BLE L_sp_521_sqr_17_outer\n\t" #else - "BLE.N L_sp_521_sqr_17_outer_%=\n\t" + "BLE.N L_sp_521_sqr_17_outer\n\t" #endif + "LDR lr, [%[a], #64]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "LDM sp!, {r6, r7}\n\t" "STM %[r]!, {r6, r7}\n\t" "SUB r5, r5, #0x8\n\t" "\n" - "L_sp_521_sqr_17_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_521_sqr_17_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_521_sqr_17_store_%=\n\t" + "BGT L_sp_521_sqr_17_store\n\t" #else - "BGT.N L_sp_521_sqr_17_store_%=\n\t" + "BGT.N L_sp_521_sqr_17_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -54838,7 +54975,7 @@ static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x40\n\t" "\n" - "L_sp_521_add_17_word_%=:\n\t" + "L_sp_521_add_17_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -54851,9 +54988,9 @@ static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_add_17_word_%=\n\t" + "BNE L_sp_521_add_17_word\n\t" #else - "BNE.N L_sp_521_add_17_word_%=\n\t" + "BNE.N L_sp_521_add_17_word\n\t" #endif "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a], {r4}\n\t" @@ -55171,7 +55308,7 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r, const sp_digit* a, const sp_digi "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_521_cond_sub_17_words_%=:\n\t" 
+ "L_sp_521_cond_sub_17_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -55182,9 +55319,9 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r, const sp_digit* a, const sp_digi "ADD r5, r5, #0x4\n\t" "CMP r5, #0x44\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_521_cond_sub_17_words_%=\n\t" + "BLT L_sp_521_cond_sub_17_words\n\t" #else - "BLT.N L_sp_521_cond_sub_17_words_%=\n\t" + "BLT.N L_sp_521_cond_sub_17_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -55451,19 +55588,19 @@ static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, sp_digit "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_521_mont_reduce_order_17_word_%=:\n\t" + "L_sp_521_mont_reduce_order_17_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" "CMP r11, #0x40\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_mont_reduce_order_17_nomask_%=\n\t" + "BNE L_sp_521_mont_reduce_order_17_nomask\n\t" #else - "BNE.N L_sp_521_mont_reduce_order_17_nomask_%=\n\t" + "BNE.N L_sp_521_mont_reduce_order_17_nomask\n\t" #endif "MOV r9, #0x1ff\n\t" "AND r10, r10, r9\n\t" "\n" - "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t" + "L_sp_521_mont_reduce_order_17_nomask:\n\t" /* a[i+0] += m[0] * mu */ "MOV r7, #0x0\n\t" "UMLAL r4, r7, r10, lr\n\t" @@ -55605,9 +55742,9 @@ static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, sp_digit "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x44\n\t" #ifdef __GNUC__ - "BLT L_sp_521_mont_reduce_order_17_word_%=\n\t" + "BLT L_sp_521_mont_reduce_order_17_word\n\t" #else - "BLT.W L_sp_521_mont_reduce_order_17_word_%=\n\t" + "BLT.W L_sp_521_mont_reduce_order_17_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -55719,19 +55856,19 @@ static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, sp_digit "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" 
- "L_sp_521_mont_reduce_order_17_word_%=:\n\t" + "L_sp_521_mont_reduce_order_17_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" "CMP r4, #0x40\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_mont_reduce_order_17_nomask_%=\n\t" + "BNE L_sp_521_mont_reduce_order_17_nomask\n\t" #else - "BNE.N L_sp_521_mont_reduce_order_17_nomask_%=\n\t" + "BNE.N L_sp_521_mont_reduce_order_17_nomask\n\t" #endif "MOV r12, #0x1ff\n\t" "AND lr, lr, r12\n\t" "\n" - "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t" + "L_sp_521_mont_reduce_order_17_nomask:\n\t" /* a[i+0] += m[0] * mu */ "LDR r12, [%[m]]\n\t" "MOV r3, #0x0\n\t" @@ -55823,9 +55960,9 @@ static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, sp_digit "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x44\n\t" #ifdef __GNUC__ - "BLT L_sp_521_mont_reduce_order_17_word_%=\n\t" + "BLT L_sp_521_mont_reduce_order_17_word\n\t" #else - "BLT.W L_sp_521_mont_reduce_order_17_word_%=\n\t" + "BLT.W L_sp_521_mont_reduce_order_17_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -55951,8 +56088,8 @@ SP_NOINLINE static void sp_521_mont_sqr_17(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. 
*/ -static void sp_521_mont_sqr_n_17(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_521_mont_sqr_n_17(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_521_mont_sqr_17(r, a, m, mp); for (; n > 1; n--) { @@ -56077,7 +56214,7 @@ static sp_int32 sp_521_cmp_17(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x40\n\t" "\n" - "L_sp_521_cmp_17_words_%=:\n\t" + "L_sp_521_cmp_17_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -56090,7 +56227,7 @@ static sp_int32 sp_521_cmp_17(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_521_cmp_17_words_%=\n\t" + "bcs L_sp_521_cmp_17_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #64]\n\t" @@ -61870,7 +62007,7 @@ static sp_digit sp_521_sub_in_place_17(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x40\n\t" "\n" - "L_sp_521_sub_in_pkace_17_word_%=:\n\t" + "L_sp_521_sub_in_pkace_17_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -61882,9 +62019,9 @@ static sp_digit sp_521_sub_in_place_17(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_sub_in_pkace_17_word_%=\n\t" + "BNE L_sp_521_sub_in_pkace_17_word\n\t" #else - "BNE.N L_sp_521_sub_in_pkace_17_word_%=\n\t" + "BNE.N L_sp_521_sub_in_pkace_17_word\n\t" #endif "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2}\n\t" @@ -61986,7 +62123,7 @@ static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_521_mul_d_17_word_%=:\n\t" + "L_sp_521_mul_d_17_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -62000,9 +62137,9 @@ static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, 
#0x4\n\t" "CMP r9, #0x44\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_521_mul_d_17_word_%=\n\t" + "BLT L_sp_521_mul_d_17_word\n\t" #else - "BLT.N L_sp_521_mul_d_17_word_%=\n\t" + "BLT.N L_sp_521_mul_d_17_word\n\t" #endif "STR r3, [%[r], #68]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -62134,9 +62271,9 @@ static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -62199,9 +62336,9 @@ static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -62225,7 +62362,7 @@ static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_521_word_17_bit_%=:\n\t" + "L_div_521_word_17_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -62235,7 +62372,7 @@ static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_521_word_17_bit_%=\n\t" + "bpl L_div_521_word_17_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -62930,7 +63067,7 @@ static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x40\n\t" "\n" - "L_sp_521_sub_17_word_%=:\n\t" + "L_sp_521_sub_17_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -62942,9 +63079,9 @@ static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_sub_17_word_%=\n\t" + "BNE L_sp_521_sub_17_word\n\t" #else - "BNE.N L_sp_521_sub_17_word_%=\n\t" + "BNE.N L_sp_521_sub_17_word\n\t" #endif "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3}\n\t" @@ -63042,9 +63179,9 @@ static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m "LDM %[a]!, {r4}\n\t" "ANDS r3, r4, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - 
"BEQ L_sp_521_div2_mod_17_even_%=\n\t" + "BEQ L_sp_521_div2_mod_17_even\n\t" #else - "BEQ.N L_sp_521_div2_mod_17_even_%=\n\t" + "BEQ.N L_sp_521_div2_mod_17_even\n\t" #endif "MOV r12, #0x0\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" @@ -63080,9 +63217,13 @@ static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m "ADCS r4, r4, r8\n\t" "STM %[r]!, {r4}\n\t" "ADC r3, r12, r12\n\t" - "B L_sp_521_div2_mod_17_div2_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_div2_mod_17_div2\n\t" +#else + "B.N L_sp_521_div2_mod_17_div2\n\t" +#endif "\n" - "L_sp_521_div2_mod_17_even_%=:\n\t" + "L_sp_521_div2_mod_17_even:\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" @@ -63094,7 +63235,7 @@ static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m "LDM %[a]!, {r4}\n\t" "STM %[r]!, {r4}\n\t" "\n" - "L_sp_521_div2_mod_17_div2_%=:\n\t" + "L_sp_521_div2_mod_17_div2:\n\t" "SUB %[r], %[r], #0x44\n\t" "LDRD r8, r9, [%[r]]\n\t" "LSR r8, r8, #1\n\t" @@ -63183,217 +63324,281 @@ static int sp_521_num_bits_17(const sp_digit* a) "LDR r1, [%[a], #64]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_16_%=\n\t" + "BEQ L_sp_521_num_bits_17_16\n\t" #else - "BEQ.N L_sp_521_num_bits_17_16_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_16\n\t" #endif "MOV r2, #0x220\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_16_%=:\n\t" + "L_sp_521_num_bits_17_16:\n\t" "LDR r1, [%[a], #60]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_15_%=\n\t" + "BEQ L_sp_521_num_bits_17_15\n\t" #else - "BEQ.N 
L_sp_521_num_bits_17_15_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_15\n\t" #endif "MOV r2, #0x200\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_15_%=:\n\t" + "L_sp_521_num_bits_17_15:\n\t" "LDR r1, [%[a], #56]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_14_%=\n\t" + "BEQ L_sp_521_num_bits_17_14\n\t" #else - "BEQ.N L_sp_521_num_bits_17_14_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_14\n\t" #endif "MOV r2, #0x1e0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_14_%=:\n\t" + "L_sp_521_num_bits_17_14:\n\t" "LDR r1, [%[a], #52]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_13_%=\n\t" + "BEQ L_sp_521_num_bits_17_13\n\t" #else - "BEQ.N L_sp_521_num_bits_17_13_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_13\n\t" #endif "MOV r2, #0x1c0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_13_%=:\n\t" + "L_sp_521_num_bits_17_13:\n\t" "LDR r1, [%[a], #48]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_12_%=\n\t" + "BEQ L_sp_521_num_bits_17_12\n\t" #else - "BEQ.N L_sp_521_num_bits_17_12_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_12\n\t" #endif "MOV r2, #0x1a0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, 
r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_12_%=:\n\t" + "L_sp_521_num_bits_17_12:\n\t" "LDR r1, [%[a], #44]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_11_%=\n\t" + "BEQ L_sp_521_num_bits_17_11\n\t" #else - "BEQ.N L_sp_521_num_bits_17_11_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_11\n\t" #endif "MOV r2, #0x180\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_11_%=:\n\t" + "L_sp_521_num_bits_17_11:\n\t" "LDR r1, [%[a], #40]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_10_%=\n\t" + "BEQ L_sp_521_num_bits_17_10\n\t" #else - "BEQ.N L_sp_521_num_bits_17_10_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_10\n\t" #endif "MOV r2, #0x160\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_10_%=:\n\t" + "L_sp_521_num_bits_17_10:\n\t" "LDR r1, [%[a], #36]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_9_%=\n\t" + "BEQ L_sp_521_num_bits_17_9\n\t" #else - "BEQ.N L_sp_521_num_bits_17_9_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_9\n\t" #endif "MOV r2, #0x140\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B 
L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_9_%=:\n\t" + "L_sp_521_num_bits_17_9:\n\t" "LDR r1, [%[a], #32]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_8_%=\n\t" + "BEQ L_sp_521_num_bits_17_8\n\t" #else - "BEQ.N L_sp_521_num_bits_17_8_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_8\n\t" #endif "MOV r2, #0x120\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_8_%=:\n\t" + "L_sp_521_num_bits_17_8:\n\t" "LDR r1, [%[a], #28]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_7_%=\n\t" + "BEQ L_sp_521_num_bits_17_7\n\t" #else - "BEQ.N L_sp_521_num_bits_17_7_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_7\n\t" #endif "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_7_%=:\n\t" + "L_sp_521_num_bits_17_7:\n\t" "LDR r1, [%[a], #24]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_6_%=\n\t" + "BEQ L_sp_521_num_bits_17_6\n\t" #else - "BEQ.N L_sp_521_num_bits_17_6_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_6\n\t" #endif "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_6_%=:\n\t" + "L_sp_521_num_bits_17_6:\n\t" 
"LDR r1, [%[a], #20]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_5_%=\n\t" + "BEQ L_sp_521_num_bits_17_5\n\t" #else - "BEQ.N L_sp_521_num_bits_17_5_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_5\n\t" #endif "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_5_%=:\n\t" + "L_sp_521_num_bits_17_5:\n\t" "LDR r1, [%[a], #16]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_4_%=\n\t" + "BEQ L_sp_521_num_bits_17_4\n\t" #else - "BEQ.N L_sp_521_num_bits_17_4_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_4\n\t" #endif "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_4_%=:\n\t" + "L_sp_521_num_bits_17_4:\n\t" "LDR r1, [%[a], #12]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_3_%=\n\t" + "BEQ L_sp_521_num_bits_17_3\n\t" #else - "BEQ.N L_sp_521_num_bits_17_3_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_3\n\t" #endif "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_3_%=:\n\t" + "L_sp_521_num_bits_17_3:\n\t" "LDR r1, [%[a], #8]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ 
L_sp_521_num_bits_17_2_%=\n\t" + "BEQ L_sp_521_num_bits_17_2\n\t" #else - "BEQ.N L_sp_521_num_bits_17_2_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_2\n\t" #endif "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_2_%=:\n\t" + "L_sp_521_num_bits_17_2:\n\t" "LDR r1, [%[a], #4]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_1_%=\n\t" + "BEQ L_sp_521_num_bits_17_1\n\t" #else - "BEQ.N L_sp_521_num_bits_17_1_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_1\n\t" #endif "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_1_%=:\n\t" + "L_sp_521_num_bits_17_1:\n\t" "LDR r1, [%[a]]\n\t" "MOV r2, #0x20\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" "\n" - "L_sp_521_num_bits_17_18_%=:\n\t" + "L_sp_521_num_bits_17_18:\n\t" "MOV %[a], r4\n\t" : [a] "+r" (a) : @@ -63862,7 +64067,7 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -63918,7 +64123,7 @@ static int sp_521_ecc_is_point_17(const sp_point_521* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. 
@@ -67780,61 +67985,80 @@ static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_1024_mul_32_outer_%=:\n\t" + "L_sp_1024_mul_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_1024_mul_32_inner_%=:\n\t" + "L_sp_1024_mul_32_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_mul_32_inner_done_%=\n\t" + "BGT L_sp_1024_mul_32_inner_done\n\t" #else - "BEQ.N L_sp_1024_mul_32_inner_done_%=\n\t" + "BGT.N L_sp_1024_mul_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_mul_32_inner_%=\n\t" + "BLT L_sp_1024_mul_32_inner\n\t" #else - "BLE.N L_sp_1024_mul_32_inner_%=\n\t" + "BLT.N L_sp_1024_mul_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_1024_mul_32_inner_done_%=:\n\t" + "L_sp_1024_mul_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE 
L_sp_1024_mul_32_outer_%=\n\t" + "BLE L_sp_1024_mul_32_outer\n\t" #else - "BLE.N L_sp_1024_mul_32_outer_%=\n\t" + "BLE.N L_sp_1024_mul_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "LDR r11, [%[b], #124]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_1024_mul_32_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_1024_mul_32_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_1024_mul_32_store_%=\n\t" + "BGT L_sp_1024_mul_32_store\n\t" #else - "BGT.N L_sp_1024_mul_32_store_%=\n\t" + "BGT.N L_sp_1024_mul_32_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -67860,24 +68084,20 @@ static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_1024_sqr_32_outer_%=:\n\t" + "L_sp_1024_sqr_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_1024_sqr_32_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_sqr_32_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_1024_sqr_32_op_sqr_%=\n\t" -#endif + "L_sp_1024_sqr_32_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -67887,59 +68107,51 @@ static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_1024_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_1024_sqr_32_op_sqr_%=:\n\t" - "LDR lr, [%[a], r3]\n\t" - 
"UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_1024_sqr_32_op_done_%=:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_sqr_32_inner_done_%=\n\t" -#else - "BEQ.N L_sp_1024_sqr_32_inner_done_%=\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_1024_sqr_32_inner_done_%=\n\t" + "BGT L_sp_1024_sqr_32_inner_done\n\t" #else - "BGT.N L_sp_1024_sqr_32_inner_done_%=\n\t" + "BGT.N L_sp_1024_sqr_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_sqr_32_inner_%=\n\t" + "BLT L_sp_1024_sqr_32_inner\n\t" #else - "BLE.N L_sp_1024_sqr_32_inner_%=\n\t" + "BLT.N L_sp_1024_sqr_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_1024_sqr_32_inner_done_%=:\n\t" + "L_sp_1024_sqr_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_sqr_32_outer_%=\n\t" + "BLE L_sp_1024_sqr_32_outer\n\t" #else - "BLE.N L_sp_1024_sqr_32_outer_%=\n\t" + "BLE.N L_sp_1024_sqr_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_1024_sqr_32_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_1024_sqr_32_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || 
defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_1024_sqr_32_store_%=\n\t" + "BGT L_sp_1024_sqr_32_store\n\t" #else - "BGT.N L_sp_1024_sqr_32_store_%=\n\t" + "BGT.N L_sp_1024_sqr_32_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -68054,7 +68266,7 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x80\n\t" "\n" - "L_sp_1024_sub_in_pkace_32_word_%=:\n\t" + "L_sp_1024_sub_in_pkace_32_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -68066,9 +68278,9 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_1024_sub_in_pkace_32_word_%=\n\t" + "BNE L_sp_1024_sub_in_pkace_32_word\n\t" #else - "BNE.N L_sp_1024_sub_in_pkace_32_word_%=\n\t" + "BNE.N L_sp_1024_sub_in_pkace_32_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -68106,7 +68318,7 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_1024_cond_sub_32_words_%=:\n\t" + "L_sp_1024_cond_sub_32_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -68117,9 +68329,9 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, #0x4\n\t" "CMP r5, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_1024_cond_sub_32_words_%=\n\t" + "BLT L_sp_1024_cond_sub_32_words\n\t" #else - "BLT.N L_sp_1024_cond_sub_32_words_%=\n\t" + "BLT.N L_sp_1024_cond_sub_32_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -68297,7 +68509,7 @@ static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x80\n\t" "\n" - 
"L_sp_1024_add_32_word_%=:\n\t" + "L_sp_1024_add_32_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -68310,9 +68522,9 @@ static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_1024_add_32_word_%=\n\t" + "BNE L_sp_1024_add_32_word\n\t" #else - "BNE.N L_sp_1024_add_32_word_%=\n\t" + "BNE.N L_sp_1024_add_32_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -68351,7 +68563,7 @@ static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_1024_mul_d_32_word_%=:\n\t" + "L_sp_1024_mul_d_32_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -68365,9 +68577,9 @@ static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_1024_mul_d_32_word_%=\n\t" + "BLT L_sp_1024_mul_d_32_word\n\t" #else - "BLT.N L_sp_1024_mul_d_32_word_%=\n\t" + "BLT.N L_sp_1024_mul_d_32_word\n\t" #endif "STR r3, [%[r], #128]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -68574,9 +68786,9 @@ static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -68639,9 +68851,9 @@ static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -68665,7 +68877,7 @@ static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_1024_word_32_bit_%=:\n\t" + "L_div_1024_word_32_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -68675,7 +68887,7 @@ static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_1024_word_32_bit_%=\n\t" + "bpl L_div_1024_word_32_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -68757,7 +68969,7 @@ static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x7c\n\t" "\n" - "L_sp_1024_cmp_32_words_%=:\n\t" + "L_sp_1024_cmp_32_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -68770,7 +68982,7 @@ static sp_int32 sp_1024_cmp_32(const 
sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_1024_cmp_32_words_%=\n\t" + "bcs L_sp_1024_cmp_32_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #124]\n\t" @@ -69490,7 +69702,7 @@ static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_1024_mont_reduce_32_word_%=:\n\t" + "L_sp_1024_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -69753,9 +69965,9 @@ static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x80\n\t" #ifdef __GNUC__ - "BLT L_sp_1024_mont_reduce_32_word_%=\n\t" + "BLT L_sp_1024_mont_reduce_32_word\n\t" #else - "BLT.W L_sp_1024_mont_reduce_32_word_%=\n\t" + "BLT.W L_sp_1024_mont_reduce_32_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -69802,7 +70014,7 @@ static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_1024_mont_reduce_32_word_%=:\n\t" + "L_sp_1024_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -69970,9 +70182,9 @@ static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x80\n\t" #ifdef __GNUC__ - "BLT L_sp_1024_mont_reduce_32_word_%=\n\t" + "BLT L_sp_1024_mont_reduce_32_word\n\t" #else - "BLT.W L_sp_1024_mont_reduce_32_word_%=\n\t" + "BLT.W L_sp_1024_mont_reduce_32_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -70987,7 +71199,7 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_1024_cond_add_32_words_%=:\n\t" + "L_sp_1024_cond_add_32_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -70998,9 +71210,9 @@ static sp_digit 
sp_1024_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "ADD r4, r4, #0x4\n\t" "CMP r4, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_1024_cond_add_32_words_%=\n\t" + "BLT L_sp_1024_cond_add_32_words\n\t" #else - "BLT.N L_sp_1024_cond_add_32_words_%=\n\t" + "BLT.N L_sp_1024_cond_add_32_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -80285,7 +80497,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) } } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -80345,7 +80557,7 @@ static int sp_1024_ecc_is_point_32(const sp_point_1024* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. diff --git a/wolfcrypt/src/sp_int.c b/wolfcrypt/src/sp_int.c index 6dd4e6a8bc..06c01ab005 100644 --- a/wolfcrypt/src/sp_int.c +++ b/wolfcrypt/src/sp_int.c @@ -6346,7 +6346,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, if (r > SP_HALF_MAX) { r = SP_HALF_MAX; } - /* Shift up result for trial division calucation. */ + /* Shift up result for trial division calculation. */ r <<= SP_HALF_SIZE; /* Calculate trial value. */ trial = r * (sp_int_word)d; @@ -12416,7 +12416,7 @@ static int _sp_invmod_mont_ct(const sp_int* a, const sp_int* m, sp_int* r, int bit = sp_is_bit_set(e, (unsigned int)i); /* 6.2. j += bit - * Update count of consequitive 1 bits. + * Update count of consecutive 1 bits. */ j += bit; /* 6.3. 
s += 1 @@ -13107,7 +13107,7 @@ static int _sp_exptmod_mont_ex(const sp_int* b, const sp_int* e, int bits, DECL_SP_INT_ARRAY(t, m->used * 2 + 1, (1 << 6) + 1); /* Window bits based on number of pre-calculations versus number of loop - * calculcations. + * calculations. * Exponents for RSA and DH will result in 6-bit windows. */ if (bits > 450) { diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 916a32fbff..990a999cbb 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -8408,8 +8409,8 @@ extern void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_256_mont_sqr_n_4(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_n_4(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_256_mont_sqr_4(r, a, m, mp); for (; n > 1; n--) { @@ -9608,8 +9609,8 @@ extern void sp_256_mont_sqr_avx2_4(sp_digit* r, const sp_digit* a, const sp_digi * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_256_mont_sqr_n_avx2_4(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_256_mont_sqr_n_avx2_4(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_256_mont_sqr_avx2_4(r, a, m, mp); for (; n > 1; n--) { @@ -26391,7 +26392,7 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. 
@@ -26447,7 +26448,7 @@ static int sp_256_ecc_is_point_4(const sp_point_256* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -27534,8 +27535,8 @@ SP_NOINLINE static void sp_384_mont_sqr_6(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_384_mont_sqr_n_6(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_384_mont_sqr_n_6(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_384_mont_sqr_6(r, a, m, mp); for (; n > 1; n--) { @@ -28768,8 +28769,8 @@ SP_NOINLINE static void sp_384_mont_sqr_avx2_6(sp_digit* r, const sp_digit* a, * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_384_mont_sqr_n_avx2_6(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_384_mont_sqr_n_avx2_6(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_384_mont_sqr_avx2_6(r, a, m, mp); for (; n > 1; n--) { @@ -51338,7 +51339,7 @@ int sp_ecc_verify_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -51394,7 +51395,7 @@ static int sp_384_ecc_is_point_6(const sp_point_384* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -52415,8 +52416,8 @@ extern void sp_521_mont_sqr_9(sp_digit* r, const sp_digit* a, const sp_digit* m, * m Modulus (prime). * mp Montgomery multiplier. 
*/ -static void sp_521_mont_sqr_n_9(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_521_mont_sqr_n_9(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_521_mont_sqr_9(r, a, m, mp); for (; n > 1; n--) { @@ -53641,8 +53642,8 @@ extern void sp_521_mont_sqr_avx2_9(sp_digit* r, const sp_digit* a, const sp_digi * m Modulus (prime). * mp Montgomery multiplier. */ -static void sp_521_mont_sqr_n_avx2_9(sp_digit* r, const sp_digit* a, int n, - const sp_digit* m, sp_digit mp) +SP_NOINLINE static void sp_521_mont_sqr_n_avx2_9(sp_digit* r, + const sp_digit* a, int n, const sp_digit* m, sp_digit mp) { sp_521_mont_sqr_avx2_9(r, a, m, mp); for (; n > 1; n--) { @@ -92476,7 +92477,7 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -92532,7 +92533,7 @@ static int sp_521_ecc_is_point_9(const sp_point_521* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. @@ -105407,7 +105408,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) } } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * point EC point. * heap Heap to use if dynamically allocating. @@ -105467,7 +105468,7 @@ static int sp_1024_ecc_is_point_16(const sp_point_1024* point, return err; } -/* Check that the x and y oridinates are a valid point on the curve. +/* Check that the x and y ordinates are a valid point on the curve. * * pX X ordinate of EC point. * pY Y ordinate of EC point. 
diff --git a/wolfcrypt/user-crypto/src/rsa.c b/wolfcrypt/user-crypto/src/rsa.c index 0c65ad098a..66357372fb 100644 --- a/wolfcrypt/user-crypto/src/rsa.c +++ b/wolfcrypt/user-crypto/src/rsa.c @@ -2042,7 +2042,7 @@ int wc_MakeRsaKey(RsaKey* key, int size, long e, WC_RNG* rng) { IppStatus ret; int scratchSz; - int i; /* for trys on calling make key */ + int i; /* for tries on calling make key */ int ctxSz; IppsBigNumState* pSrcPublicExp = NULL; @@ -2178,7 +2178,7 @@ int wc_MakeRsaKey(RsaKey* key, int size, long e, WC_RNG* rng) goto makeKeyEnd; } - /* call IPP to generate keys, if inseficent entropy error call again */ + /* call IPP to generate keys, if insufficient entropy error call again */ ret = ippStsInsufficientEntropy; while (ret == ippStsInsufficientEntropy) { ret = ippsRSA_GenerateKeys(pSrcPublicExp, key->n, key->e, diff --git a/wolfssl/internal.h b/wolfssl/internal.h index c4805ee7aa..aa7e89a70a 100644 --- a/wolfssl/internal.h +++ b/wolfssl/internal.h @@ -1764,7 +1764,7 @@ enum Misc { #ifndef ECDHE_SIZE /* allow this to be overridden at compile-time */ ECDHE_SIZE = 32, /* ECDHE server size defaults to 256 bit */ #endif - MAX_EXPORT_ECC_SZ = 256, /* Export ANS X9.62 max future size */ + MAX_EXPORT_ECC_SZ = 256, /* Export ANSI X9.62 max future size */ MAX_CURVE_NAME_SZ = 16, /* Maximum size of curve name string */ NEW_SA_MAJOR = 8, /* Most significant byte used with new sig algos */ @@ -3576,7 +3576,7 @@ struct WOLFSSL_CTX { byte sendVerify:2; /* for client side (can not be single bit) */ byte haveRSA:1; /* RSA available */ byte haveECC:1; /* ECC available */ - byte haveDH:1; /* server DH parms set by user */ + byte haveDH:1; /* server DH params set by user */ byte haveECDSAsig:1; /* server cert signed w/ ECDSA */ byte haveFalconSig:1; /* server cert signed w/ Falcon */ byte haveDilithiumSig:1;/* server cert signed w/ Dilithium */ @@ -4630,7 +4630,7 @@ struct Options { word16 usingCompression:1; /* are we using compression */ word16 haveRSA:1; /* RSA available 
*/ word16 haveECC:1; /* ECC available */ - word16 haveDH:1; /* server DH parms set by user */ + word16 haveDH:1; /* server DH params set by user */ word16 haveECDSAsig:1; /* server ECDSA signed cert */ word16 haveStaticECC:1; /* static server ECC private key */ word16 haveFalconSig:1; /* server Falcon signed cert */ diff --git a/wolfssl/wolfcrypt/ext_lms.h b/wolfssl/wolfcrypt/ext_lms.h index 8b8a8f7288..ccdfdcb30e 100644 --- a/wolfssl/wolfcrypt/ext_lms.h +++ b/wolfssl/wolfcrypt/ext_lms.h @@ -48,8 +48,8 @@ typedef struct hss_extra_info hss_extra_info; struct LmsKey { unsigned levels; /* Number of tree levels. */ - param_set_t lm_type[MAX_HSS_LEVELS]; /* Height parm per level. */ - param_set_t lm_ots_type[MAX_HSS_LEVELS]; /* Winternitz parm per level. */ + param_set_t lm_type[MAX_HSS_LEVELS]; /* Height param per level. */ + param_set_t lm_ots_type[MAX_HSS_LEVELS]; /* Winternitz param per level. */ unsigned char pub[HSS_MAX_PUBLIC_KEY_LEN]; #ifndef WOLFSSL_LMS_VERIFY_ONLY hss_working_key * working_key; diff --git a/wolfssl/wolfcrypt/lms.h b/wolfssl/wolfcrypt/lms.h index d3ab07571d..483f349c1e 100644 --- a/wolfssl/wolfcrypt/lms.h +++ b/wolfssl/wolfcrypt/lms.h @@ -94,8 +94,8 @@ enum wc_LmsParm { /* enum wc_LmsState is to help track the state of an LMS/HSS Key. */ enum wc_LmsState { WC_LMS_STATE_FREED, /* Key has been freed from memory. */ - WC_LMS_STATE_INITED, /* Key has been inited, ready to set parms.*/ - WC_LMS_STATE_PARMSET, /* Parms are set, ready to MakeKey or Reload. */ + WC_LMS_STATE_INITED, /* Key has been inited, ready to set params.*/ + WC_LMS_STATE_PARMSET, /* Params are set, ready to MakeKey or Reload. */ WC_LMS_STATE_OK, /* Able to sign signatures and verify. */ WC_LMS_STATE_VERIFYONLY, /* A public only LmsKey. */ WC_LMS_STATE_BAD, /* Can't guarantee key's state. 
*/ diff --git a/wolfssl/wolfcrypt/xmss.h b/wolfssl/wolfcrypt/xmss.h index 7cd8f27ffa..70f26c484c 100644 --- a/wolfssl/wolfcrypt/xmss.h +++ b/wolfssl/wolfcrypt/xmss.h @@ -104,8 +104,8 @@ enum wc_XmssRc { /* enum wc_XmssState is to help track the state of an XMSS Key. */ enum wc_XmssState { WC_XMSS_STATE_FREED, /* Key has been freed from memory. */ - WC_XMSS_STATE_INITED, /* Key has been inited, ready to set parms.*/ - WC_XMSS_STATE_PARMSET, /* Parms are set, ready to MakeKey or Reload. */ + WC_XMSS_STATE_INITED, /* Key has been inited, ready to set params.*/ + WC_XMSS_STATE_PARMSET, /* Params are set, ready to MakeKey or Reload. */ WC_XMSS_STATE_OK, /* Able to sign signatures and verify. */ WC_XMSS_STATE_VERIFYONLY, /* A public only XmssKey. */ WC_XMSS_STATE_BAD, /* Can't guarantee key's state. */