From d2219006d215f62c79a56e0f0a2cb1ba37b049d7 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Mon, 29 Jan 2024 15:57:15 -0600 Subject: [PATCH 01/85] Refactor --- pqc/kem/__init__.py | 6 ------ pqc/kem/hqc_128.py | 10 +++++++--- pqc/kem/hqc_192.py | 10 +++++++--- pqc/kem/hqc_256.py | 10 +++++++--- pqc/kem/kyber1024.py | 11 ++++++----- pqc/kem/kyber512.py | 11 ++++++----- pqc/kem/kyber768.py | 11 ++++++----- pqc/kem/mceliece348864.py | 6 +++--- pqc/kem/mceliece460896.py | 6 +++--- pqc/kem/mceliece6688128.py | 8 ++++---- pqc/kem/mceliece6960119.py | 6 +++--- pqc/kem/mceliece8192128.py | 6 +++--- pqc/sign/dilithium2.py | 12 ++++++------ pqc/sign/dilithium3.py | 12 ++++++------ pqc/sign/dilithium5.py | 12 ++++++------ pqc/sign/falcon_1024.py | 12 +++++------- pqc/sign/falcon_512.py | 12 +++++------- pqc/sign/sphincs_sha2_128f_simple.py | 12 ++++++------ pqc/sign/sphincs_sha2_128s_simple.py | 12 ++++++------ pqc/sign/sphincs_sha2_192f_simple.py | 12 ++++++------ pqc/sign/sphincs_sha2_192s_simple.py | 12 ++++++------ pqc/sign/sphincs_sha2_256f_simple.py | 12 ++++++------ pqc/sign/sphincs_sha2_256s_simple.py | 12 ++++++------ pqc/sign/sphincs_shake_128f_simple.py | 12 ++++++------ pqc/sign/sphincs_shake_128s_simple.py | 12 ++++++------ pqc/sign/sphincs_shake_192f_simple.py | 12 ++++++------ pqc/sign/sphincs_shake_192s_simple.py | 12 ++++++------ pqc/sign/sphincs_shake_256f_simple.py | 12 ++++++------ pqc/sign/sphincs_shake_256s_simple.py | 12 ++++++------ pyproject.toml | 2 +- 30 files changed, 156 insertions(+), 151 deletions(-) delete mode 100644 pqc/kem/__init__.py diff --git a/pqc/kem/__init__.py b/pqc/kem/__init__.py deleted file mode 100644 index eb00ebe4..00000000 --- a/pqc/kem/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -__all__ = [ - 'mceliece348864', - 'mceliece460896', - 'mceliece6688128', - 'mceliece6960119', - 'mceliece8192128'] diff --git a/pqc/kem/hqc_128.py b/pqc/kem/hqc_128.py index d4ea4355..74169b10 100644 --- a/pqc/kem/hqc_128.py +++ b/pqc/kem/hqc_128.py @@ -12,6 +12,10 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') +_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') +_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') +_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') + def keypair(): pk = ffi.new(_T_PUBLICKEY) @@ -20,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -32,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -45,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/hqc_192.py b/pqc/kem/hqc_192.py index e3ed0c9e..fac5f4e3 100644 --- a/pqc/kem/hqc_192.py +++ b/pqc/kem/hqc_192.py @@ -12,6 +12,10 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') +_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') +_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') +_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') + def keypair(): pk = ffi.new(_T_PUBLICKEY) @@ -20,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -32,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -45,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/hqc_256.py b/pqc/kem/hqc_256.py index 309ffec9..30a1df3f 100644 --- a/pqc/kem/hqc_256.py +++ b/pqc/kem/hqc_256.py @@ -12,6 +12,10 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') +_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') +_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') +_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') + def keypair(): pk = ffi.new(_T_PUBLICKEY) @@ -20,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -32,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -45,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/kyber1024.py b/pqc/kem/kyber1024.py index 7ecc10e1..912ffcf7 100644 --- a/pqc/kem/kyber1024.py +++ b/pqc/kem/kyber1024.py @@ -12,8 +12,9 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_indcpa_enc = getattr(lib, f'{_LIB_NAMESPACE}indcpa_enc') -_indcpa_dec = getattr(lib, f'{_LIB_NAMESPACE}indcpa_dec') +_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') +_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') +_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') def keypair(): @@ -23,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -35,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -48,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/kyber512.py b/pqc/kem/kyber512.py index 945e1049..e465486e 100644 --- a/pqc/kem/kyber512.py +++ b/pqc/kem/kyber512.py @@ -12,8 +12,9 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_indcpa_enc = getattr(lib, f'{_LIB_NAMESPACE}indcpa_enc') -_indcpa_dec = getattr(lib, f'{_LIB_NAMESPACE}indcpa_dec') +_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') +_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') +_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') def keypair(): @@ -23,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -35,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -48,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/kyber768.py b/pqc/kem/kyber768.py index 01282a54..de2a257d 100644 --- a/pqc/kem/kyber768.py +++ b/pqc/kem/kyber768.py @@ -12,8 +12,9 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_indcpa_enc = getattr(lib, f'{_LIB_NAMESPACE}indcpa_enc') -_indcpa_dec = getattr(lib, f'{_LIB_NAMESPACE}indcpa_dec') +_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') +_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') +_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') def keypair(): @@ -23,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -35,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -48,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/mceliece348864.py b/pqc/kem/mceliece348864.py index bb92803a..28a38d81 100644 --- a/pqc/kem/mceliece348864.py +++ b/pqc/kem/mceliece348864.py @@ -24,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -36,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -49,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/mceliece460896.py b/pqc/kem/mceliece460896.py index 0fdded86..d8742ee6 100644 --- a/pqc/kem/mceliece460896.py +++ b/pqc/kem/mceliece460896.py @@ -24,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -36,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -49,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/mceliece6688128.py b/pqc/kem/mceliece6688128.py index 30e1d182..08bad48f 100644 --- a/pqc/kem/mceliece6688128.py +++ b/pqc/kem/mceliece6688128.py @@ -1,4 +1,4 @@ -from .._lib.libmceliece66881128f_clean import ffi, lib +from .._lib.libmceliece6688128f_clean import ffi, lib __all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] @@ -24,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -36,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -49,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/mceliece6960119.py b/pqc/kem/mceliece6960119.py index 4a75143f..d3024500 100644 --- a/pqc/kem/mceliece6960119.py +++ b/pqc/kem/mceliece6960119.py @@ -24,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -36,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -49,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/kem/mceliece8192128.py b/pqc/kem/mceliece8192128.py index c5ee1360..602a54cc 100644 --- a/pqc/kem/mceliece8192128.py +++ b/pqc/kem/mceliece8192128.py @@ -24,7 +24,7 @@ def keypair(): errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") return bytes(pk), bytes(sk) @@ -36,7 +36,7 @@ def encap(pk): errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") return bytes(key), bytes(ciphertext) @@ -49,7 +49,7 @@ def decap(ciphertext, sk): errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") return bytes(key) diff --git a/pqc/sign/dilithium2.py b/pqc/sign/dilithium2.py index 7193d8ae..1920d71a 100644 --- a/pqc/sign/dilithium2.py +++ b/pqc/sign/dilithium2.py @@ -1,6 +1,6 @@ from .._lib.libdilithium2_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/dilithium3.py b/pqc/sign/dilithium3.py index a47f2fef..53bdf551 100644 --- a/pqc/sign/dilithium3.py +++ b/pqc/sign/dilithium3.py @@ -1,6 +1,6 @@ from .._lib.libdilithium3_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/dilithium5.py b/pqc/sign/dilithium5.py index d8f1d321..e9f7ee17 100644 --- a/pqc/sign/dilithium5.py +++ b/pqc/sign/dilithium5.py @@ -1,6 +1,6 @@ from .._lib.libdilithium5_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/falcon_1024.py b/pqc/sign/falcon_1024.py index 81168bf8..58db8645 100644 --- a/pqc/sign/falcon_1024.py +++ b/pqc/sign/falcon_1024.py @@ -1,6 +1,6 @@ from .._lib.libfalcon_1024_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -10,7 +10,6 @@ _crypto_sign_keypair = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_keypair') _crypto_sign_signature = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_signature') _crypto_sign_verify = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_verify') -_SIGNATURE_MAXLEN = getattr(lib, f'{_LIB_NAMESPACE}CRYPTO_BYTES') def keypair(): @@ -20,7 +19,7 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) @@ -34,17 +33,16 @@ def sign(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sigbuf, _siglen, _m, len(m), _sk) + _sig = _sigbuf[0:_siglen[0]] # Variable-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - _sig = _sigbuf[0:_siglen[0]] # Non-copying slice operation return bytes(_sig) def verify(sig, m, pk): _sig = ffi.from_buffer(sig) - assert len(_sig) <= _SIGNATURE_MAXLEN _m = ffi.from_buffer(m) @@ -55,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/falcon_512.py b/pqc/sign/falcon_512.py index 48557595..b9fc7194 100644 --- a/pqc/sign/falcon_512.py +++ b/pqc/sign/falcon_512.py @@ -1,6 +1,6 @@ from .._lib.libfalcon_512_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -10,7 +10,6 @@ _crypto_sign_keypair = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_keypair') _crypto_sign_signature = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_signature') _crypto_sign_verify = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_verify') -_SIGNATURE_MAXLEN = getattr(lib, f'{_LIB_NAMESPACE}CRYPTO_BYTES') def keypair(): @@ -20,7 +19,7 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) @@ -34,17 +33,16 @@ def sign(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sigbuf, _siglen, _m, len(m), _sk) + _sig = _sigbuf[0:_siglen[0]] # Variable-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - _sig = _sigbuf[0:_siglen[0]] # Non-copying slice operation return bytes(_sig) def verify(sig, m, pk): _sig = ffi.from_buffer(sig) - assert len(_sig) <= _SIGNATURE_MAXLEN _m = ffi.from_buffer(m) @@ -55,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_sha2_128f_simple.py b/pqc/sign/sphincs_sha2_128f_simple.py index 910ad902..100743d5 100644 --- a/pqc/sign/sphincs_sha2_128f_simple.py +++ b/pqc/sign/sphincs_sha2_128f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_128f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_sha2_128s_simple.py b/pqc/sign/sphincs_sha2_128s_simple.py index 7407f799..a8f8e3ad 100644 --- a/pqc/sign/sphincs_sha2_128s_simple.py +++ b/pqc/sign/sphincs_sha2_128s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_128s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_sha2_192f_simple.py b/pqc/sign/sphincs_sha2_192f_simple.py index 4fd403e0..2e08207f 100644 --- a/pqc/sign/sphincs_sha2_192f_simple.py +++ b/pqc/sign/sphincs_sha2_192f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_192f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_sha2_192s_simple.py b/pqc/sign/sphincs_sha2_192s_simple.py index d51d23c4..9c883781 100644 --- a/pqc/sign/sphincs_sha2_192s_simple.py +++ b/pqc/sign/sphincs_sha2_192s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_192s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_sha2_256f_simple.py b/pqc/sign/sphincs_sha2_256f_simple.py index d09e4b27..5b74fe15 100644 --- a/pqc/sign/sphincs_sha2_256f_simple.py +++ b/pqc/sign/sphincs_sha2_256f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_256f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_sha2_256s_simple.py b/pqc/sign/sphincs_sha2_256s_simple.py index 681d4524..fa17fae4 100644 --- a/pqc/sign/sphincs_sha2_256s_simple.py +++ b/pqc/sign/sphincs_sha2_256s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_256s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_shake_128f_simple.py b/pqc/sign/sphincs_shake_128f_simple.py index d689d434..2a4f7fbc 100644 --- a/pqc/sign/sphincs_shake_128f_simple.py +++ b/pqc/sign/sphincs_shake_128f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_128f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_shake_128s_simple.py b/pqc/sign/sphincs_shake_128s_simple.py index 6f67854e..4487688e 100644 --- a/pqc/sign/sphincs_shake_128s_simple.py +++ b/pqc/sign/sphincs_shake_128s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_128s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_shake_192f_simple.py b/pqc/sign/sphincs_shake_192f_simple.py index b8284828..a53be3dc 100644 --- a/pqc/sign/sphincs_shake_192f_simple.py +++ b/pqc/sign/sphincs_shake_192f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_192f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_shake_192s_simple.py b/pqc/sign/sphincs_shake_192s_simple.py index a381f112..f7762d78 100644 --- a/pqc/sign/sphincs_shake_192s_simple.py +++ b/pqc/sign/sphincs_shake_192s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_192s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_shake_256f_simple.py b/pqc/sign/sphincs_shake_256f_simple.py index 72b7a2f1..37e07f23 100644 --- a/pqc/sign/sphincs_shake_256f_simple.py +++ b/pqc/sign/sphincs_shake_256f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_256f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pqc/sign/sphincs_shake_256s_simple.py b/pqc/sign/sphincs_shake_256s_simple.py index 3e12ebfe..68f39201 100644 --- a/pqc/sign/sphincs_shake_256s_simple.py +++ b/pqc/sign/sphincs_shake_256s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_256s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -19,11 +19,11 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") return bytes(_pk), bytes(_sk) -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) @@ -33,10 +33,10 @@ def signature(m, sk): _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? + assert len(_sig) == _siglen[0] # Fixed-length signature if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") return bytes(_sig) @@ -53,7 +53,7 @@ def verify(sig, m, pk): if errno: if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") return diff --git a/pyproject.toml b/pyproject.toml index 0672548b..5854cc78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.1.post1" +version = "0.0.6.2a0-dev0" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} From d08005deead8d4f270ff39eb92cc249e097b33e0 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 09:41:24 -0600 Subject: [PATCH 02/85] Move patent warning to import-time --- cffi_modules/_common_cffi_maker.py | 8 ++------ cffi_modules/_falcon_cffi_maker.py | 8 +------- cffi_modules/_hqc_cffi_maker.py | 8 +------- cffi_modules/_kyber_cffi_maker.py | 14 +------------- pqc/_util.py | 24 ++++++++++++++---------- pqc/kem/hqc_128.py | 8 ++++++++ pqc/kem/hqc_192.py | 8 ++++++++ pqc/kem/hqc_256.py | 8 ++++++++ pqc/kem/kyber1024.py | 12 ++++++++++++ pqc/kem/kyber512.py | 12 ++++++++++++ pqc/kem/kyber768.py | 12 ++++++++++++ pqc/sign/falcon_1024.py | 6 ++++++ pqc/sign/falcon_512.py | 8 ++++++++ pyproject.toml | 2 +- 14 files changed, 94 insertions(+), 44 deletions(-) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index dc62976b..ee472c0c 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -6,14 +6,13 @@ import re import warnings -from pqc._util import partition_list, map_immed, extant_with_other_suffix, patent_warning +from pqc._util import partition_list, map_immed, extant_with_other_suffix _NAMESPACE_RE = re.compile(r'(?ms)^#define\s+(CRYPTO_NAMESPACE)\s*\(\s*(\w+)\s*\)\s+(\w+)\s*##\s*\2\s*$') def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, common_sources=frozenset(), - parent_module='pqc._lib', - patent_info=None): + parent_module='pqc._lib'): # 0. local variables # @@ -26,9 +25,6 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # 1. module_name # module_name = f'{parent_module}.{lib_name}' - if patent_info is not None: - patent_message = patent_warning(lib_name, patent_info) - warnings.warn(patent_message) # 2. cdefs, c_header_sources # diff --git a/cffi_modules/_falcon_cffi_maker.py b/cffi_modules/_falcon_cffi_maker.py index ed3febd7..48719eb1 100644 --- a/cffi_modules/_falcon_cffi_maker.py +++ b/cffi_modules/_falcon_cffi_maker.py @@ -5,10 +5,4 @@ def make_falcon_ffi(build_root): common_sources = ['fips202.c', 'randombytes.c'] - patent_info = ( - 2, - ['US7308097B2'], [ - 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] - ) - - return make_sign_ffi(build_root=build_root, common_sources=common_sources, patent_info=patent_info) + return make_sign_ffi(build_root=build_root, common_sources=common_sources) diff --git a/cffi_modules/_hqc_cffi_maker.py b/cffi_modules/_hqc_cffi_maker.py index cb29883d..93eb7951 100644 --- a/cffi_modules/_hqc_cffi_maker.py +++ b/cffi_modules/_hqc_cffi_maker.py @@ -4,10 +4,4 @@ def make_hqc_ffi(build_root): common_sources = ['fips202.c', 'randombytes.c'] - patent_info = ( - 3, [ - 'FR2956541B1/US9094189B2/EP2537284B1',], [ - 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] - ) - - return make_kem_ffi(build_root=build_root, common_sources=common_sources, patent_info=patent_info) + return make_kem_ffi(build_root=build_root, common_sources=common_sources) diff --git a/cffi_modules/_kyber_cffi_maker.py b/cffi_modules/_kyber_cffi_maker.py index 878a8289..d4279be3 100644 --- a/cffi_modules/_kyber_cffi_maker.py +++ b/cffi_modules/_kyber_cffi_maker.py @@ -4,18 +4,6 @@ def make_kyber_ffi(build_root): common_sources = ['fips202.c', 'randombytes.c'] - patent_info = ( - 1,[ - 'FR2956541A1/US9094189B2/EP2537284B1', - 'US9246675/EP2837128B1', - 'potential unknown others'], [ - 'https://ntruprime.cr.yp.to/faq.html', - 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/nist-pqc-license-summary-and-excerpts.pdf', - 'https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/G0DoD7lkGPk/m/d7Zw0qhGBwAJ', - 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', - 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] - ) - extra_cdefs = [dedent("""\ // Exposed internal interface void %(namespace)sindcpa_enc(uint8_t *c, const uint8_t *m, const uint8_t *pk, const uint8_t *coins); @@ -27,4 +15,4 @@ def make_kyber_ffi(build_root): #include "indcpa.h" """)] - return make_kem_ffi(build_root=build_root, extra_c_header_sources=extra_c_header_sources, extra_cdefs=extra_cdefs, common_sources=common_sources, patent_info=patent_info) + return make_kem_ffi(build_root=build_root, extra_c_header_sources=extra_c_header_sources, extra_cdefs=extra_cdefs, common_sources=common_sources) diff --git a/pqc/_util.py b/pqc/_util.py index 6c9b8e00..8bc87a9b 100644 --- a/pqc/_util.py +++ b/pqc/_util.py @@ -32,14 +32,12 @@ def map_immed(f, it, *, splat=False): deque((map if not splat else starmap)(f, it), 0) -def patent_warning(subject, patent_info): - severity, patents, links = patent_info - +def patent_notice(patents, subject, severity, links, stacklevel=0): if severity == 0: - return None + return if severity == 1: - return dedent(f"""\ + warn(dedent(f"""\ {subject} may be protected under patent(s) {'; '.join(patents)}. If you rely on this library via PyPI, it could break at any time if I'm forced by the patentholders to remove this module. Additionally, the patentholders might impose on you *additional* terms, beyond those stated in the software's license. @@ -47,10 +45,12 @@ def patent_warning(subject, patent_info): This is not legal advice. For more information, see: """) + '\n'.join(links) + dedent(f""" - If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it.""") + If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it."""), + stacklevel=2+stacklevel) + return if severity == 2: - return dedent(f"""\ + warn(dedent(f"""\ {subject} may be protected under patent(s) {'; '.join(patents)}. ITS LICENSING STATUS FOR PUBLIC USE IS DISPUTED OR UNKNOWN AT THIS TIME. If you rely on this library via PyPI, it could break at any time if I'm forced by the patentholders to remove this module. @@ -59,10 +59,12 @@ def patent_warning(subject, patent_info): This is not legal advice. For more information, see: """) + '\n'.join(links) + dedent(f""" - If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it.""") + If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it."""), + stacklevel=2+stacklevel) + return if severity == 3: - return dedent(f"""\ + warn(dedent(f"""\ {subject} may be protected under patent(s) {'; '.join(patents)}. IT MIGHT NOT BE LICENSED FOR PUBLIC USE AT THIS TIME. If you rely on this library via PyPI, it could break at any time if I'm forced by the patentholders to remove this module. @@ -71,6 +73,8 @@ def patent_warning(subject, patent_info): This is not legal advice. For more information, see: """) + '\n'.join(links) + dedent(f""" - If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it.""") + If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it."""), + stacklevel=2+stacklevel) + return raise ValueError(f'severity = {severity}') diff --git a/pqc/kem/hqc_128.py b/pqc/kem/hqc_128.py index 74169b10..4d4eabde 100644 --- a/pqc/kem/hqc_128.py +++ b/pqc/kem/hqc_128.py @@ -1,5 +1,13 @@ from .._lib.libhqc_128_clean import ffi, lib +import os +if os.environ.get('LICENSED_HQC', '0') == '0': + from ..util import patent_notice + patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + ) + __all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') diff --git a/pqc/kem/hqc_192.py b/pqc/kem/hqc_192.py index fac5f4e3..3ecccb11 100644 --- a/pqc/kem/hqc_192.py +++ b/pqc/kem/hqc_192.py @@ -1,5 +1,13 @@ from .._lib.libhqc_192_clean import ffi, lib +import os +if os.environ.get('LICENSED_HQC', '0') == '0': + from ..util import patent_notice + patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + ) + __all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') diff --git a/pqc/kem/hqc_256.py b/pqc/kem/hqc_256.py index 30a1df3f..34e1e290 100644 --- a/pqc/kem/hqc_256.py +++ b/pqc/kem/hqc_256.py @@ -1,5 +1,13 @@ from .._lib.libhqc_256_clean import ffi, lib +import os +if os.environ.get('LICENSED_HQC', '0') == '0': + from ..util import patent_notice + patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + ) + __all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') diff --git a/pqc/kem/kyber1024.py b/pqc/kem/kyber1024.py index 912ffcf7..bae29081 100644 --- a/pqc/kem/kyber1024.py +++ b/pqc/kem/kyber1024.py @@ -1,5 +1,17 @@ from .._lib.libkyber1024_clean import ffi, lib +import os +if os.environ.get('LICENSED_KYBER', '0') == '0': + from .._util import patent_notice + patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], + 'the Kyber cryptosystem', 1, [ + 'https://ntruprime.cr.yp.to/faq.html', + 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/nist-pqc-license-summary-and-excerpts.pdf', + 'https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/G0DoD7lkGPk/m/d7Zw0qhGBwAJ', + 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', + 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] + ) + __all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') diff --git a/pqc/kem/kyber512.py b/pqc/kem/kyber512.py index e465486e..6f007ef6 100644 --- a/pqc/kem/kyber512.py +++ b/pqc/kem/kyber512.py @@ -1,5 +1,17 @@ from .._lib.libkyber512_clean import ffi, lib +import os +if os.environ.get('LICENSED_KYBER', '0') == '0': + from .._util import patent_notice + patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], + 'the Kyber cryptosystem', 1, [ + 'https://ntruprime.cr.yp.to/faq.html', + 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/nist-pqc-license-summary-and-excerpts.pdf', + 'https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/G0DoD7lkGPk/m/d7Zw0qhGBwAJ', + 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', + 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] + ) + __all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') diff --git a/pqc/kem/kyber768.py b/pqc/kem/kyber768.py index de2a257d..f9c3f880 100644 --- a/pqc/kem/kyber768.py +++ b/pqc/kem/kyber768.py @@ -1,5 +1,17 @@ from .._lib.libkyber768_clean import ffi, lib +import os +if os.environ.get('LICENSED_KYBER', '0') == '0': + from .._util import patent_notice + patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], + 'the Kyber cryptosystem', 1, [ + 'https://ntruprime.cr.yp.to/faq.html', + 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/nist-pqc-license-summary-and-excerpts.pdf', + 'https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/G0DoD7lkGPk/m/d7Zw0qhGBwAJ', + 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', + 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] + ) + __all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') diff --git a/pqc/sign/falcon_1024.py b/pqc/sign/falcon_1024.py index 58db8645..63dfe54a 100644 --- a/pqc/sign/falcon_1024.py +++ b/pqc/sign/falcon_1024.py @@ -1,5 +1,11 @@ from .._lib.libfalcon_1024_clean import ffi, lib +from .._util import warn_patent +warn_patent(['US7308097B2'], + 'the Falcon cryptosystem', 2, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] +) + __all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') diff --git a/pqc/sign/falcon_512.py b/pqc/sign/falcon_512.py index b9fc7194..1965d3cd 100644 --- a/pqc/sign/falcon_512.py +++ b/pqc/sign/falcon_512.py @@ -1,5 +1,13 @@ from .._lib.libfalcon_512_clean import ffi, lib +import os +os.environ.get('LICENSED_FALCON', '0') == '0': + from .._util import warn_patent + warn_patent(['US7308097B2'], + 'the Falcon cryptosystem', 2, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] + ) + __all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') diff --git a/pyproject.toml b/pyproject.toml index 5854cc78..848feba7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev0" +version = "0.0.6.2a0-dev1" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} From 796348910ac72052679d71bcca621b4dd4b6dc7a Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 10:12:39 -0600 Subject: [PATCH 03/85] Fix typos --- pqc/kem/hqc_128.py | 12 ++++-------- pqc/kem/hqc_192.py | 12 ++++-------- pqc/kem/hqc_256.py | 12 ++++-------- pqc/kem/kyber1024.py | 6 +----- pqc/kem/kyber512.py | 6 +----- pqc/kem/kyber768.py | 6 +----- pqc/sign/falcon_1024.py | 12 +++++++----- pqc/sign/falcon_512.py | 10 +++++----- pyproject.toml | 2 +- 9 files changed, 28 insertions(+), 50 deletions(-) diff --git a/pqc/kem/hqc_128.py b/pqc/kem/hqc_128.py index 4d4eabde..4db2fda4 100644 --- a/pqc/kem/hqc_128.py +++ b/pqc/kem/hqc_128.py @@ -2,13 +2,13 @@ import os if os.environ.get('LICENSED_HQC', '0') == '0': - from ..util import patent_notice + from .._util import patent_notice patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], - 'the HQC cryptosystem', 3, - ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] ) -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -20,10 +20,6 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') -_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') -_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') - def keypair(): pk = ffi.new(_T_PUBLICKEY) diff --git a/pqc/kem/hqc_192.py b/pqc/kem/hqc_192.py index 3ecccb11..6df50ca6 100644 --- a/pqc/kem/hqc_192.py +++ b/pqc/kem/hqc_192.py @@ -2,13 +2,13 @@ import os if os.environ.get('LICENSED_HQC', '0') == '0': - from ..util import patent_notice + from .._util import patent_notice patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], - 'the HQC cryptosystem', 3, - ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] ) -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -20,10 +20,6 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') -_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') -_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') - def keypair(): pk = ffi.new(_T_PUBLICKEY) diff --git a/pqc/kem/hqc_256.py b/pqc/kem/hqc_256.py index 34e1e290..5db868a8 100644 --- a/pqc/kem/hqc_256.py +++ b/pqc/kem/hqc_256.py @@ -2,13 +2,13 @@ import os if os.environ.get('LICENSED_HQC', '0') == '0': - from ..util import patent_notice + from .._util import patent_notice patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], - 'the HQC cryptosystem', 3, - ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] ) -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -20,10 +20,6 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') -_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') -_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') - def keypair(): pk = ffi.new(_T_PUBLICKEY) diff --git a/pqc/kem/kyber1024.py b/pqc/kem/kyber1024.py index bae29081..be901ec5 100644 --- a/pqc/kem/kyber1024.py +++ b/pqc/kem/kyber1024.py @@ -12,7 +12,7 @@ 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] ) -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -24,10 +24,6 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') -_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') -_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') - def keypair(): pk = ffi.new(_T_PUBLICKEY) diff --git a/pqc/kem/kyber512.py b/pqc/kem/kyber512.py index 6f007ef6..e22f0cd1 100644 --- a/pqc/kem/kyber512.py +++ b/pqc/kem/kyber512.py @@ -12,7 +12,7 @@ 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] ) -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -24,10 +24,6 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') -_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') -_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') - def keypair(): pk = ffi.new(_T_PUBLICKEY) diff --git a/pqc/kem/kyber768.py b/pqc/kem/kyber768.py index f9c3f880..3b25e067 100644 --- a/pqc/kem/kyber768.py +++ b/pqc/kem/kyber768.py @@ -12,7 +12,7 @@ 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] ) -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -24,10 +24,6 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_pk_gen = getattr(lib, f'{_LIB_NAMESPACE}pk_gen') -_encrypt = getattr(lib, f'{_LIB_NAMESPACE}encrypt') -_deccrypt = getattr(lib, f'{_LIB_NAMESPACE}decrypt') - def keypair(): pk = ffi.new(_T_PUBLICKEY) diff --git a/pqc/sign/falcon_1024.py b/pqc/sign/falcon_1024.py index 63dfe54a..61d957fc 100644 --- a/pqc/sign/falcon_1024.py +++ b/pqc/sign/falcon_1024.py @@ -1,10 +1,12 @@ from .._lib.libfalcon_1024_clean import ffi, lib -from .._util import warn_patent -warn_patent(['US7308097B2'], - 'the Falcon cryptosystem', 2, - ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] -) +import os +if os.environ.get('LICENSED_FALCON', '0') == '0': + from .._util import patent_notice + patent_notice(['US7308097B2'], + 'the Falcon cryptosystem', 2, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] + ) __all__ = ['keypair', 'sign', 'verify'] diff --git a/pqc/sign/falcon_512.py b/pqc/sign/falcon_512.py index 1965d3cd..8728b47a 100644 --- a/pqc/sign/falcon_512.py +++ b/pqc/sign/falcon_512.py @@ -1,11 +1,11 @@ from .._lib.libfalcon_512_clean import ffi, lib import os -os.environ.get('LICENSED_FALCON', '0') == '0': - from .._util import warn_patent - warn_patent(['US7308097B2'], - 'the Falcon cryptosystem', 2, - ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] +if os.environ.get('LICENSED_FALCON', '0') == '0': + from .._util import patent_notice + patent_notice(['US7308097B2'], + 'the Falcon cryptosystem', 2, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] ) __all__ = ['keypair', 'sign', 'verify'] diff --git a/pyproject.toml b/pyproject.toml index 848feba7..f493b714 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev1" +version = "0.0.6.2a0-dev2" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} From a6a84835a66f4d0649217f6629b9afd14a755f3b Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 12:22:37 -0600 Subject: [PATCH 04/85] Document Pain --- .github/workflows/bdist.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 533c7989..20b11b37 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -8,6 +8,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: + # FIXME? PQClean GNU extensions break clang thus MacOS os: [ubuntu-20.04, windows-2019] steps: @@ -24,7 +25,7 @@ jobs: env: CIBW_BUILD_VERBOSITY: 1 # Python 3.6 is EOL - # CFFI or maybe PQClean doesn't seem to work with musllinux + # FIXME? PQClean GNU extensions break musl CIBW_SKIP: "cp36* *-musllinux_*" # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" From 3cd22c299d281aa267668eca142597fbfd2b9ac5 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 13:21:21 -0600 Subject: [PATCH 05/85] PyPy in CI? --- pyproject.toml | 2 +- setup.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f493b714..2d22b490 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev2" +version = "0.0.6.2a0-dev3" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} diff --git a/setup.py b/setup.py index 3035526b..116af1dc 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,10 @@ # https://github.com/pypa/setuptools/issues/1040 from setuptools import setup -from distutils.command.build_ext import build_ext as _build_ext from wheel.bdist_wheel import bdist_wheel as _bdist_wheel -class bdist_wheel_abi_none(_bdist_wheel): +class site_bdist_wheel(_bdist_wheel): """https://github.com/joerick/python-ctypes-package-sample/blob/7db688cd6ee32ae95bce0f75fb7d806926e20252/setup.py#L29""" def finalize_options(self): _bdist_wheel.finalize_options(self) @@ -14,11 +13,16 @@ def finalize_options(self): def get_tag(self): python, abi, plat = _bdist_wheel.get_tag(self) - return "py3", "none", plat + if self.py_limited_api and platform.python_implementation() not in { + 'PyPy', # https://github.com/orgs/pypy/discussions/4884#discussioncomment-8309845 + }: + python = f'py{sys.version_info.major}' + abi = f'abi{sys.version_info.major}' + return python, abi, plat setup( - cmdclass={"bdist_wheel": bdist_wheel_abi_none}, + cmdclass={"bdist_wheel": site_bdist_wheel}, cffi_modules=[ 'cffi_modules/dilithium2_clean.py:ffi', 'cffi_modules/dilithium3_clean.py:ffi', From 03bd1d12b82d5f794ee671d4f7e6a283a2096dde Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 13:28:59 -0600 Subject: [PATCH 06/85] debug test --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 116af1dc..d56c7f27 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,9 @@ def get_tag(self): }: python = f'py{sys.version_info.major}' abi = f'abi{sys.version_info.major}' + else: + import pprint; raise AssertionError(pprint.pformat(locals() | + {'self.py_limited_api': self.py_limited_api, 'platform.python_implementation()': platform.python_implementation()})) return python, abi, plat From 7568f6f01001d4ad39f224634f72173aeefdcfe9 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 13:31:03 -0600 Subject: [PATCH 07/85] fix typo --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index d56c7f27..3df5dfb7 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ # https://foss.heptapod.net/pypy/cffi/-/issues/441 # https://github.com/pypa/setuptools/issues/1040 +import platform from setuptools import setup from wheel.bdist_wheel import bdist_wheel as _bdist_wheel From 11d5c6ab3f48359998136410a6f5a071123528d3 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 13:34:12 -0600 Subject: [PATCH 08/85] fix typo, Python 3.7 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3df5dfb7..24a7d3cf 100644 --- a/setup.py +++ b/setup.py @@ -20,8 +20,8 @@ def get_tag(self): python = f'py{sys.version_info.major}' abi = f'abi{sys.version_info.major}' else: - import pprint; raise AssertionError(pprint.pformat(locals() | - {'self.py_limited_api': self.py_limited_api, 'platform.python_implementation()': platform.python_implementation()})) + import pprint; raise AssertionError(pprint.pformat((locals(), + {'self.py_limited_api': self.py_limited_api, 'platform.python_implementation()': platform.python_implementation()))) return python, abi, plat From 803224893adefdfd9f513f212da4929eccb5ff32 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 13:35:59 -0600 Subject: [PATCH 09/85] fix2 typo2 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 24a7d3cf..306cdadc 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ def get_tag(self): abi = f'abi{sys.version_info.major}' else: import pprint; raise AssertionError(pprint.pformat((locals(), - {'self.py_limited_api': self.py_limited_api, 'platform.python_implementation()': platform.python_implementation()))) + {'self.py_limited_api': self.py_limited_api, 'platform.python_implementation()': platform.python_implementation()}))) return python, abi, plat From 36735f6bc00d6aaab3627afbea5c8b7439a55ae7 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 13:43:34 -0600 Subject: [PATCH 10/85] this is supposed to work --- setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 306cdadc..1904ed03 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,9 @@ import platform from setuptools import setup -from wheel.bdist_wheel import bdist_wheel as _bdist_wheel +import wheel.bdist_wheel as _mod_bdist_wheel +_mod_bdist_wheel.PY_LIMITED_API_PATTERN = r'(cp|py)\d' +_bdist_wheel = _mod_bdist_wheel.bdist_wheel class site_bdist_wheel(_bdist_wheel): @@ -19,7 +21,8 @@ def get_tag(self): }: python = f'py{sys.version_info.major}' abi = f'abi{sys.version_info.major}' - else: + if not self.py_limited_api and platform.python_implementation() in {'CPython'}: + # https://github.com/python-cffi/cffi/blob/v1.16.0/src/cffi/setuptools_ext.py#L114 import pprint; raise AssertionError(pprint.pformat((locals(), {'self.py_limited_api': self.py_limited_api, 'platform.python_implementation()': platform.python_implementation()}))) return python, abi, plat From a37c5549e200774417a91a2ad4164436352dd28c Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 14:35:50 -0600 Subject: [PATCH 11/85] more ABI3 stuff --- pyproject.toml | 2 +- setup.py | 31 ++++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2d22b490..c9806bbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev3" +version = "0.0.6.2a0-dev4" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} diff --git a/setup.py b/setup.py index 1904ed03..46f1fe38 100644 --- a/setup.py +++ b/setup.py @@ -3,28 +3,41 @@ import platform from setuptools import setup +import sys import wheel.bdist_wheel as _mod_bdist_wheel -_mod_bdist_wheel.PY_LIMITED_API_PATTERN = r'(cp|py)\d' +#_mod_bdist_wheel.PY_LIMITED_API_PATTERN = r'(cp|py)\d' _bdist_wheel = _mod_bdist_wheel.bdist_wheel +# Pending https://hpyproject.org/ +ABI3_EXCLUDE_IMPLEMENTATIONS = { + 'PyPy', # https://github.com/orgs/pypy/discussions/4884#discussioncomment-8309845 +} + + class site_bdist_wheel(_bdist_wheel): """https://github.com/joerick/python-ctypes-package-sample/blob/7db688cd6ee32ae95bce0f75fb7d806926e20252/setup.py#L29""" + def finalize_options(self): - _bdist_wheel.finalize_options(self) - self.root_is_pure = False + # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L244 + if (all(ext.py_limited_api for ext in self.distribution.ext_modules) + and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS + ): + self.py_limited_api = f'cp{sys.version_info.major}{sys.version_info.minor}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}{sys.version_info.minor}' + super().finalize_options() def get_tag(self): python, abi, plat = _bdist_wheel.get_tag(self) - if self.py_limited_api and platform.python_implementation() not in { - 'PyPy', # https://github.com/orgs/pypy/discussions/4884#discussioncomment-8309845 - }: - python = f'py{sys.version_info.major}' + if self.py_limited_api and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS: + python = f'cp{sys.version_info.major}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}' abi = f'abi{sys.version_info.major}' if not self.py_limited_api and platform.python_implementation() in {'CPython'}: # https://github.com/python-cffi/cffi/blob/v1.16.0/src/cffi/setuptools_ext.py#L114 - import pprint; raise AssertionError(pprint.pformat((locals(), - {'self.py_limited_api': self.py_limited_api, 'platform.python_implementation()': platform.python_implementation()}))) + import pprint; raise AssertionError(pprint.pformat({**locals(), + 'self.py_limited_api': self.py_limited_api, + 'platform.python_implementation()': platform.python_implementation(), + 'self.distribution.ext_modules': [(lambda obj: {k: getattr(obj, k) for k in dir(obj) if not k.startswith('_')})(obj) for obj in self.distribution.ext_modules], + })) return python, abi, plat From d0a2dc7c84782d5220933f0a6f9298a455daf1f6 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 14:46:34 -0600 Subject: [PATCH 12/85] further ABI3 cleanup --- pyproject.toml | 2 +- setup.py | 25 +++++++++---------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c9806bbc..6b88b3ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev4" +version = "0.0.6.2a0-dev5" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} diff --git a/setup.py b/setup.py index 46f1fe38..acac2b0d 100644 --- a/setup.py +++ b/setup.py @@ -4,10 +4,7 @@ import platform from setuptools import setup import sys -import wheel.bdist_wheel as _mod_bdist_wheel -#_mod_bdist_wheel.PY_LIMITED_API_PATTERN = r'(cp|py)\d' -_bdist_wheel = _mod_bdist_wheel.bdist_wheel - +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel # Pending https://hpyproject.org/ ABI3_EXCLUDE_IMPLEMENTATIONS = { @@ -20,24 +17,20 @@ class site_bdist_wheel(_bdist_wheel): def finalize_options(self): # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L244 - if (all(ext.py_limited_api for ext in self.distribution.ext_modules) - and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS + if (platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS + # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L267 + and (self.distribution.has_ext_modules() or self.distribution.has_c_libraries()) + # https://github.com/pypa/setuptools/blob/v69.0.3/setuptools/command/build_ext.py#L160 + and all(ext.py_limited_api for ext in self.distribution.ext_modules) ): self.py_limited_api = f'cp{sys.version_info.major}{sys.version_info.minor}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}{sys.version_info.minor}' super().finalize_options() def get_tag(self): python, abi, plat = _bdist_wheel.get_tag(self) - if self.py_limited_api and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS: - python = f'cp{sys.version_info.major}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}' - abi = f'abi{sys.version_info.major}' - if not self.py_limited_api and platform.python_implementation() in {'CPython'}: - # https://github.com/python-cffi/cffi/blob/v1.16.0/src/cffi/setuptools_ext.py#L114 - import pprint; raise AssertionError(pprint.pformat({**locals(), - 'self.py_limited_api': self.py_limited_api, - 'platform.python_implementation()': platform.python_implementation(), - 'self.distribution.ext_modules': [(lambda obj: {k: getattr(obj, k) for k in dir(obj) if not k.startswith('_')})(obj) for obj in self.distribution.ext_modules], - })) + #if self.py_limited_api and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS: + # python = f'cp{sys.version_info.major}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}' + # abi = f'abi{sys.version_info.major}' return python, abi, plat From a31e034fbb07a26d2df29303b9ad6acdae467186 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 14:51:05 -0600 Subject: [PATCH 13/85] Last element of cleanup for ABI3? --- pyproject.toml | 2 +- setup.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6b88b3ec..04396e0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev5" +version = "0.0.6.2a0-dev6" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} diff --git a/setup.py b/setup.py index acac2b0d..0a512f81 100644 --- a/setup.py +++ b/setup.py @@ -28,9 +28,9 @@ def finalize_options(self): def get_tag(self): python, abi, plat = _bdist_wheel.get_tag(self) - #if self.py_limited_api and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS: - # python = f'cp{sys.version_info.major}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}' - # abi = f'abi{sys.version_info.major}' + if self.py_limited_api and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS: + python = f'cp{sys.version_info.major}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}' + abi = f'abi{sys.version_info.major}' return python, abi, plat From 5c8d9941164251a1f7c41ce2a0b33268a15c8d96 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:11:01 -0600 Subject: [PATCH 14/85] CONFOUND IT https://github.com/pypa/cibuildwheel/blob/v2.16.3/cibuildwheel/util.py#L653 --- pyproject.toml | 2 +- setup.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 04396e0c..0fd67ffc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev6" +version = "0.0.6.2a0-dev7" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} diff --git a/setup.py b/setup.py index 0a512f81..79841898 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,8 @@ def finalize_options(self): def get_tag(self): python, abi, plat = _bdist_wheel.get_tag(self) if self.py_limited_api and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS: - python = f'cp{sys.version_info.major}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}' + # https://github.com/pypa/cibuildwheel/blob/v2.16.3/cibuildwheel/util.py#L653 + python = f'cp{sys.version_info.major}{sys.version_info.minor}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}{sys.version_info.minor}' abi = f'abi{sys.version_info.major}' return python, abi, plat From 74b65fc2800c7b9e0136956059caedbab696bb63 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:16:03 -0600 Subject: [PATCH 15/85] Supporting cp36 isn't that hard --- .github/workflows/bdist.yaml | 3 +-- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 20b11b37..e753d302 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -24,9 +24,8 @@ jobs: run: python -m cibuildwheel --output-dir wheelhouse env: CIBW_BUILD_VERBOSITY: 1 - # Python 3.6 is EOL # FIXME? PQClean GNU extensions break musl - CIBW_SKIP: "cp36* *-musllinux_*" + CIBW_SKIP: "*-musllinux_*" # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" diff --git a/pyproject.toml b/pyproject.toml index 0fd67ffc..23ad5b5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev7" +version = "0.0.6.2a0-dev8" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} @@ -16,5 +16,5 @@ namespaces = false requires = [ 'cffi >= 1.14.5', 'setuptools >= 49.5.0', - 'wheel >= 0.38.0', + 'wheel >= 0.30.0', ] From dca7f326d450b729c1a48cc166760e645d4b2cc4 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:29:06 -0600 Subject: [PATCH 16/85] Fix CI on Windows https://cibuildwheel.readthedocs.io/en/stable/options/#repair-wheel-command --- .github/workflows/bdist.yaml | 3 ++- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index e753d302..ccaa7608 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -24,8 +24,9 @@ jobs: run: python -m cibuildwheel --output-dir wheelhouse env: CIBW_BUILD_VERBOSITY: 1 + # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl - CIBW_SKIP: "*-musllinux_*" + CIBW_SKIP: "*36-win* *-musllinux_*" # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" diff --git a/pyproject.toml b/pyproject.toml index 23ad5b5c..802900c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev8" +version = "0.0.6.2a0-dev9" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} From 3fee780f5270c0e3a1eb40eddaf9aa2e32447623 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:41:28 -0600 Subject: [PATCH 17/85] CI: factor out PyPy into a separate "job" --- .github/workflows/bdist.yaml | 9 ++++++++- pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index ccaa7608..7bc1bf95 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -4,12 +4,18 @@ on: [push] jobs: build_wheels: - name: bdist_wheel on ${{ matrix.os }} + name: bdist_wheel for ${{ matrix.cibw-impl-name }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: matrix: # FIXME? PQClean GNU extensions break clang thus MacOS os: [ubuntu-20.04, windows-2019] + cibw-impl: ["cp3*", "pp3*"] + include: + - cibw-impl: "cp3*" + cibw-impl-name: "CPython 3.X" + - cibw-impl: "pp3*" + cibw-impl-name: "PyPy 3.X" steps: - uses: actions/checkout@v4 @@ -23,6 +29,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: + CIBW_BUILD "${{ matrix.cibw-impl }}*" CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl diff --git a/pyproject.toml b/pyproject.toml index 802900c7..fcbafda0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev9" +version = "0.0.6.2a0-dev10" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} From f7d7df9888b955e5a85837d0a26c33091163017a Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:45:42 -0600 Subject: [PATCH 18/85] YAML syntax --- .github/workflows/bdist.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 7bc1bf95..33aaed01 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -4,7 +4,7 @@ on: [push] jobs: build_wheels: - name: bdist_wheel for ${{ matrix.cibw-impl-name }} on ${{ matrix.os }} + name: bdist_wheel for ${{ matrix.cibw_impl_name }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: matrix: @@ -12,10 +12,10 @@ jobs: os: [ubuntu-20.04, windows-2019] cibw-impl: ["cp3*", "pp3*"] include: - - cibw-impl: "cp3*" - cibw-impl-name: "CPython 3.X" - - cibw-impl: "pp3*" - cibw-impl-name: "PyPy 3.X" + - cibw_impl: "cp3*" + cibw_impl_name: "CPython 3.X" + - cibw_impl: "pp3*" + cibw_impl_name: "PyPy 3.X" steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: - CIBW_BUILD "${{ matrix.cibw-impl }}*" + CIBW_BUILD "${{ matrix.cibw_impl }}*" CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl From fe419b47b9f1baa3407508b4772e0c457ecc94fe Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:49:12 -0600 Subject: [PATCH 19/85] YAML2 https://stackoverflow.com/a/66705463/1874170 --- .github/workflows/bdist.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 33aaed01..e1a05c79 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -29,7 +29,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: - CIBW_BUILD "${{ matrix.cibw_impl }}*" + CIBW_BUILD ${{ matrix.cibw_impl }}* CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl From 8cef72237190fb5bf1489735bd1c0dacb032b665 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:49:52 -0600 Subject: [PATCH 20/85] YAML3 --- .github/workflows/bdist.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index e1a05c79..ccad7b78 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -29,7 +29,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: - CIBW_BUILD ${{ matrix.cibw_impl }}* + CIBW_BUILD ${{ matrix.cibw_impl }} CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl From 52e2cde3e22d01c07b24c0936405256198418d73 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:51:18 -0600 Subject: [PATCH 21/85] YAML99 --- .github/workflows/bdist.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index ccad7b78..5c1154b0 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -29,7 +29,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: - CIBW_BUILD ${{ matrix.cibw_impl }} + CIBW_BUILD: ${{ matrix.cibw_impl }} CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl From 3beefd92643b177fc2ddf83f963fed0a387f1483 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:52:33 -0600 Subject: [PATCH 22/85] GitHub Actions fix name --- .github/workflows/bdist.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 5c1154b0..198214b4 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -10,11 +10,11 @@ jobs: matrix: # FIXME? PQClean GNU extensions break clang thus MacOS os: [ubuntu-20.04, windows-2019] - cibw-impl: ["cp3*", "pp3*"] + cibw-impl: ["cp3", "pp3"] include: - - cibw_impl: "cp3*" + - cibw_impl: "cp3" cibw_impl_name: "CPython 3.X" - - cibw_impl: "pp3*" + - cibw_impl: "pp3" cibw_impl_name: "PyPy 3.X" steps: @@ -29,7 +29,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: - CIBW_BUILD: ${{ matrix.cibw_impl }} + CIBW_BUILD: "${{ matrix.cibw_impl }}*" CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl From a91ed3b04257b17ea96595e6ee8ab1ade7e4e4db Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 15:53:50 -0600 Subject: [PATCH 23/85] last YAML typo --- .github/workflows/bdist.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 198214b4..07e9ce82 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -10,7 +10,7 @@ jobs: matrix: # FIXME? PQClean GNU extensions break clang thus MacOS os: [ubuntu-20.04, windows-2019] - cibw-impl: ["cp3", "pp3"] + cibw_impl: ["cp3", "pp3"] include: - cibw_impl: "cp3" cibw_impl_name: "CPython 3.X" From 6b41570b2ec0017b39f68fb4ceb67982dbd5c8a1 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 16:01:15 -0600 Subject: [PATCH 24/85] CI: more tidying --- .github/workflows/bdist.yaml | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 07e9ce82..99dbf3a6 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -4,18 +4,22 @@ on: [push] jobs: build_wheels: - name: bdist_wheel for ${{ matrix.cibw_impl_name }} on ${{ matrix.os }} - runs-on: ${{ matrix.os }} + name: bdist_wheel for ${{ matrix.py-impl }} on ${{ matrix.os }} + runs-on: ${{ matrix.github_os }} strategy: matrix: - # FIXME? PQClean GNU extensions break clang thus MacOS - os: [ubuntu-20.04, windows-2019] - cibw_impl: ["cp3", "pp3"] + # FIXME? PQClean GNU extensions break clang thus the default toolchain used by CIBW on Github Actions MacOS + os: [Windows, Linux] + py-impl: ["CPython 3.X", "PyPy 3.X"] include: - - cibw_impl: "cp3" - cibw_impl_name: "CPython 3.X" - - cibw_impl: "pp3" - cibw_impl_name: "PyPy 3.X" + - os: Windows + github_os: windows-2019 + - os: Linux + github_os: ubuntu-20.04 + - py-impl: "CPython 3.X" + cibw_build: "cp3*" + - py-impl: "PyPy 3.X" + cibw_build: "pp3*" steps: - uses: actions/checkout@v4 @@ -29,7 +33,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: - CIBW_BUILD: "${{ matrix.cibw_impl }}*" + CIBW_BUILD: ${{ matrix.cibw_impl }} CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl From 08c58e8506e6403ce625b12971b4b4c5b61eedd4 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 16:05:04 -0600 Subject: [PATCH 25/85] CI: does it Mac? --- .github/workflows/bdist.yaml | 4 +++- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 99dbf3a6..c9c0b086 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -9,11 +9,13 @@ jobs: strategy: matrix: # FIXME? PQClean GNU extensions break clang thus the default toolchain used by CIBW on Github Actions MacOS - os: [Windows, Linux] + os: [Windows, Mac, Linux] py-impl: ["CPython 3.X", "PyPy 3.X"] include: - os: Windows github_os: windows-2019 + - os: Mac + github_os: macos-11 - os: Linux github_os: ubuntu-20.04 - py-impl: "CPython 3.X" diff --git a/pyproject.toml b/pyproject.toml index fcbafda0..6397ba06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev10" +version = "0.0.6.2a1-dev0" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} From 8be9245904a0a4e6b6d8b316dcd809c52f7efe97 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 16:17:00 -0600 Subject: [PATCH 26/85] Try Makefile CFLAGs --- cffi_modules/_common_cffi_maker.py | 13 +++++++++++-- pyproject.toml | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index ee472c0c..726a8821 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -78,7 +78,17 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, included_ffis = [] - extra_compile_args = [] + extra_compile_args = makefile_parsed['CFLAGS'] + + include_dirs = [build_root] + + _include_compile_args, extra_compile_args = partition_list( + lambda arg: not re.match(r'-I(.+)', arg), + extra_compile_args + ) + for arg in _include_compile_args: + include_dirs.append(build_root / arg[2:]) + if platform.system() == 'Windows': # https://foss.heptapod.net/pypy/cffi/-/issues/516 # https://www.reddit.com/r/learnpython/comments/175js2u/def_extern_says_im_not_using_it_in_api_mode/ @@ -91,7 +101,6 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # https://learn.microsoft.com/en-us/windows/win32/seccrypto/required-libraries libraries.append('Advapi32') - include_dirs = [(build_root), (common_dir)] # 5. create, return # diff --git a/pyproject.toml b/pyproject.toml index fcbafda0..6a26bb82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2a0-dev10" +version = "0.0.6.2a2-dev0" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} From d436ccf217778004234f30f1ead52d52d4a0a140 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 16:28:07 -0600 Subject: [PATCH 27/85] Fix idiot typo --- cffi_modules/_common_cffi_maker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index 726a8821..1850c357 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -78,7 +78,7 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, included_ffis = [] - extra_compile_args = makefile_parsed['CFLAGS'] + extra_compile_args = makefile_parsed['CFLAGS'].split() include_dirs = [build_root] @@ -101,7 +101,6 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # https://learn.microsoft.com/en-us/windows/win32/seccrypto/required-libraries libraries.append('Advapi32') - # 5. create, return # ffibuilder = FFI() From b48cd1f381954538cb4456d8a12c72704ce798c9 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 16:31:27 -0600 Subject: [PATCH 28/85] double idiot typo --- cffi_modules/_common_cffi_maker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index 1850c357..7a97e85a 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -82,7 +82,7 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, include_dirs = [build_root] - _include_compile_args, extra_compile_args = partition_list( + extra_compile_args, _include_compile_args = partition_list( lambda arg: not re.match(r'-I(.+)', arg), extra_compile_args ) From aeff73b866f57f2cd638d0c93d01c8e98ab574d2 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 16:39:18 -0600 Subject: [PATCH 29/85] Worst Practices --- cffi_modules/_common_cffi_maker.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index 7a97e85a..9ab1a24e 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -77,10 +77,11 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # 4. included_ffis, extra_compile_args, libraries, include_dirs # included_ffis = [] - extra_compile_args = makefile_parsed['CFLAGS'].split() - include_dirs = [build_root] + libraries = [] + + # Modifications extra_compile_args, _include_compile_args = partition_list( lambda arg: not re.match(r'-I(.+)', arg), @@ -89,13 +90,13 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, for arg in _include_compile_args: include_dirs.append(build_root / arg[2:]) + extra_compile_args.remove('-Werror') # FIXME + if platform.system() == 'Windows': # https://foss.heptapod.net/pypy/cffi/-/issues/516 # https://www.reddit.com/r/learnpython/comments/175js2u/def_extern_says_im_not_using_it_in_api_mode/ # https://learn.microsoft.com/en-us/cpp/build/reference/tc-tp-tc-tp-specify-source-file-type?view=msvc-170 extra_compile_args.append('/TC') - - libraries = [] if platform.system() == 'Windows': # https://stackoverflow.com/questions/69900013/link-error-cannot-build-python-c-extension-in-windows # https://learn.microsoft.com/en-us/windows/win32/seccrypto/required-libraries From c2aeb57dce2730ea4a41c382f66a1662161fc1f0 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 16:59:15 -0600 Subject: [PATCH 30/85] bworst practices --- cffi_modules/_common_cffi_maker.py | 35 ++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index 9ab1a24e..a350448e 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -18,6 +18,7 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, build_root = Path(build_root) makefile_parsed = parse_makefile(build_root / 'Makefile') + cflag_makefile_parsed = makefile_parsed if platform.system() != 'Windows' else parse_makefile(build_root / 'Makefile.microsoft_nmake') common_dir = build_root / '..' / '..' / '..' / 'common' _lib_name = Path(makefile_parsed['LIB']).stem lib_name = _lib_name.replace('-', '_') @@ -77,27 +78,39 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # 4. included_ffis, extra_compile_args, libraries, include_dirs # included_ffis = [] - extra_compile_args = makefile_parsed['CFLAGS'].split() + extra_compile_args = cflag_makefile_parsed['CFLAGS'].split() include_dirs = [build_root] libraries = [] # Modifications - extra_compile_args, _include_compile_args = partition_list( - lambda arg: not re.match(r'-I(.+)', arg), - extra_compile_args - ) - for arg in _include_compile_args: - include_dirs.append(build_root / arg[2:]) - - extra_compile_args.remove('-Werror') # FIXME - + # * Move "include" flags to setuptools + _to_pop = [] + for i, arg in enumerate(extra_compile_args): + if arg.startswith('-I'): + include_dirs.append(build_root / arg[2:]) + _to_pop.extend([i]) + if arg.startswith('/I'): + include_dirs.append(build_root / extra_compile_args[i+1]) + _to_pop.extend([i, i+1]) + map_immed(extra_compile_args.pop, reversed(_to_pop)) + + # * FIXME don't make errors fatal + _to_pop = [] + for i, arg in enumerate(extra_compile_args): + if arg.startswith('-Werror'): + _to_pop.extend([i]) + if arg == '/WX': + _to_pop.extend([i]) + map_immed(extra_compile_args.pop, reversed(_to_pop)) + + # * Other Windows compiler fixes if platform.system() == 'Windows': # https://foss.heptapod.net/pypy/cffi/-/issues/516 # https://www.reddit.com/r/learnpython/comments/175js2u/def_extern_says_im_not_using_it_in_api_mode/ # https://learn.microsoft.com/en-us/cpp/build/reference/tc-tp-tc-tp-specify-source-file-type?view=msvc-170 extra_compile_args.append('/TC') - if platform.system() == 'Windows': + # https://stackoverflow.com/questions/69900013/link-error-cannot-build-python-c-extension-in-windows # https://learn.microsoft.com/en-us/windows/win32/seccrypto/required-libraries libraries.append('Advapi32') From c254ef2014992daf46f668850f97aa7518106682 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 18:44:59 -0600 Subject: [PATCH 31/85] CI: maybe a fix? If this works, h/t Behdad Esfabod https://github.com/harfbuzz/harfbuzz/issues/1763#issuecomment-500574443 --- cffi_modules/_common_cffi_maker.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index a350448e..5d115d94 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -115,6 +115,13 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # https://learn.microsoft.com/en-us/windows/win32/seccrypto/required-libraries libraries.append('Advapi32') + # * Other Mac OS compiler fixes + if platform.system() == 'Darwin': + # https://github.com/JamesTheAwesomeDude/pypqc/issues/9 + # https://github.com/actions/runner-images/issues/1938 + # https://github.com/harfbuzz/harfbuzz/issues/1763#issuecomment-500574443 + extra_compile_args.append('-DHB_NO_PRAGMA_GCC_DIAGNOSTIC_ERROR') + # 5. create, return # ffibuilder = FFI() From 6d2254dc520032bbab6b037f808edb9e9b835a0c Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 19:18:00 -0600 Subject: [PATCH 32/85] CI: more surgical Mac OS efforts --- cffi_modules/_common_cffi_maker.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index 5d115d94..5d37600c 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -120,7 +120,12 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # https://github.com/JamesTheAwesomeDude/pypqc/issues/9 # https://github.com/actions/runner-images/issues/1938 # https://github.com/harfbuzz/harfbuzz/issues/1763#issuecomment-500574443 - extra_compile_args.append('-DHB_NO_PRAGMA_GCC_DIAGNOSTIC_ERROR') + extra_compile_args.extend([ + '-DHB_NO_PRAGMA_GCC_DIAGNOSTIC_ERROR', + '-Wno-error=implicit-function-declaration', + '-Wno-error=pedantic', + '-Wno-error=macro-redefined', + ]) # 5. create, return # From 5427be4ba07e640d7ed4730e12ce02cb3f9e33a8 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 19:39:37 -0600 Subject: [PATCH 33/85] CI: back off on Mac OS -Wno-error=... --- cffi_modules/_common_cffi_maker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index 5d37600c..1c1367c3 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -119,11 +119,8 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, if platform.system() == 'Darwin': # https://github.com/JamesTheAwesomeDude/pypqc/issues/9 # https://github.com/actions/runner-images/issues/1938 - # https://github.com/harfbuzz/harfbuzz/issues/1763#issuecomment-500574443 extra_compile_args.extend([ - '-DHB_NO_PRAGMA_GCC_DIAGNOSTIC_ERROR', '-Wno-error=implicit-function-declaration', - '-Wno-error=pedantic', '-Wno-error=macro-redefined', ]) From 10c722c9aaae706deed067a8925ad9eadc0087d9 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 20:08:45 -0600 Subject: [PATCH 34/85] CI: remove CPython 3.6 after all Currently it's causing cibw to output malformed packages, and it is deprecated by the PSF anyway... --- .github/workflows/bdist.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index c9c0b086..7c651b7e 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -39,7 +39,7 @@ jobs: CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl - CIBW_SKIP: "*36-win* *-musllinux_*" + CIBW_SKIP: "cp36-* *-musllinux_*" # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" From d9c51c0409b30c0327a1870a5e83125c95600a3d Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 20:45:41 -0600 Subject: [PATCH 35/85] CI: fix typo --- .github/workflows/bdist.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml index 7c651b7e..5c62b0f6 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/bdist.yaml @@ -35,7 +35,7 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: - CIBW_BUILD: ${{ matrix.cibw_impl }} + CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) # FIXME? PQClean GNU extensions break musl From 00ae5010217367856cc8a7b299fbc317038cc465 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 21:14:31 -0600 Subject: [PATCH 36/85] Include sdist in Github CI https://www.github.com/pypa/cibuildwheel/issues/173#issuecomment-1501236916 --- .../{bdist.yaml => python_build.yaml} | 48 ++++++++++++++----- 1 file changed, 35 insertions(+), 13 deletions(-) rename .github/workflows/{bdist.yaml => python_build.yaml} (58%) diff --git a/.github/workflows/bdist.yaml b/.github/workflows/python_build.yaml similarity index 58% rename from .github/workflows/bdist.yaml rename to .github/workflows/python_build.yaml index 5c62b0f6..bae42cf8 100644 --- a/.github/workflows/bdist.yaml +++ b/.github/workflows/python_build.yaml @@ -3,41 +3,62 @@ name: Build on: [push] jobs: - build_wheels: - name: bdist_wheel for ${{ matrix.py-impl }} on ${{ matrix.os }} + sdist: + name: Source build for pip + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + - name: install requirements + run: python -m pip install -r requirements.txt + - name: build sdist + run: python -m build --sdist && twine check ./dist/* + - name: upload sdist + uses: actions/upload-artifact@v4 + with: + name: sdist + path: ./dist/* + + bdist: + name: Binary build for ${{ matrix.py-impl }} on ${{ matrix.os }} runs-on: ${{ matrix.github_os }} strategy: matrix: - # FIXME? PQClean GNU extensions break clang thus the default toolchain used by CIBW on Github Actions MacOS - os: [Windows, Mac, Linux] py-impl: ["CPython 3.X", "PyPy 3.X"] + os: [Windows, Mac, Linux] + include: + - py-impl: "CPython 3.X" + cibw_build: "cp3*" + - py-impl: "PyPy 3.X" + cibw_build: "pp3*" + # Use the oldest OSes available for compatibility - os: Windows github_os: windows-2019 - os: Mac github_os: macos-11 - os: Linux github_os: ubuntu-20.04 - - py-impl: "CPython 3.X" - cibw_build: "cp3*" - - py-impl: "PyPy 3.X" - cibw_build: "pp3*" steps: - - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 + with: + name: sdist + path: ./dist/ # Used to host cibuildwheel - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 - name: Install cibuildwheel run: python -m pip install cibuildwheel==2.16.3 - name: Build wheels - run: python -m cibuildwheel --output-dir wheelhouse + # https://github.com/pypa/cibuildwheel/issues/173#issuecomment-1501236916 + run: python -m cibuildwheel ./dist/*.tar.gz --output-dir ./dist env: CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_BUILD_VERBOSITY: 1 - # FIXME? cibuildwheel doesn't property implement delvewheel yet (delvewheel needs Python 3.7+ runtime, but CPython doesn't support using non-current runtimes for REPAIR_WHEEL_COMMAND) + # FIXME? cibuildwheel disagrees with CPython 3.6 in some way # FIXME? PQClean GNU extensions break musl CIBW_SKIP: "cp36-* *-musllinux_*" # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E @@ -47,4 +68,5 @@ jobs: - uses: actions/upload-artifact@v4 with: name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} - path: ./wheelhouse/*.whl + path: ./dist/*.whl + From bc59cff3c4f6972dc52af0c6fadbca883518b08b Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 21:20:08 -0600 Subject: [PATCH 37/85] CI: fix sdist --- .github/workflows/python_build.yaml | 4 ++-- requirements-dev.txt | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index bae42cf8..9522ca8a 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -9,8 +9,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 - - name: install requirements - run: python -m pip install -r requirements.txt + - name: install dev requirements + run: python -m pip install -r requirements-dev.txt - name: build sdist run: python -m build --sdist && twine check ./dist/* - name: upload sdist diff --git a/requirements-dev.txt b/requirements-dev.txt index 14f55940..f1c482cf 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,5 @@ -pip >= 18.0 -cffi >= 1.15.0 +cffi >= 1.14.5 setuptools >= 49.5.0 +wheel >= 0.30.0 +# Beyond "build" requirements +twine >= 1.15.0 From daa623ced307f9c986eba8822030bc0b99c3b8ee Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 21:21:53 -0600 Subject: [PATCH 38/85] fix stupid typo --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index f1c482cf..d6f2b699 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,4 +2,5 @@ cffi >= 1.14.5 setuptools >= 49.5.0 wheel >= 0.30.0 # Beyond "build" requirements +build >= 0.6.0 twine >= 1.15.0 From 46566fb7f844f1c2c199e9511554ffd639ebf023 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 21:22:52 -0600 Subject: [PATCH 39/85] last line in sdist CI https://docs.github.com/en/actions/using-workflows/storing-workflow-data-as-artifacts --- .github/workflows/python_build.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 9522ca8a..c8c5718f 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -21,6 +21,7 @@ jobs: bdist: name: Binary build for ${{ matrix.py-impl }} on ${{ matrix.os }} + needs: sdist runs-on: ${{ matrix.github_os }} strategy: matrix: From 9dc4875e2f4cad59a59d74e4e478242b27a2e93a Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 21:24:55 -0600 Subject: [PATCH 40/85] Undo "setup-python@v4" regression --- .github/workflows/python_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index c8c5718f..94bc9791 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -48,7 +48,7 @@ jobs: path: ./dist/ # Used to host cibuildwheel - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v3 - name: Install cibuildwheel run: python -m pip install cibuildwheel==2.16.3 From f69912a7c5c06b796d9aa1ccbffc629d6f88897c Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 21:38:17 -0600 Subject: [PATCH 41/85] CI: Windows doesn't have shell globbing --- .github/workflows/python_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 94bc9791..f5e9a6b8 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -55,7 +55,7 @@ jobs: - name: Build wheels # https://github.com/pypa/cibuildwheel/issues/173#issuecomment-1501236916 - run: python -m cibuildwheel ./dist/*.tar.gz --output-dir ./dist + run: python -c "import glob,pathlib,runpy,sys; sys.argv[:] = ['cibuildwheel', *glob.glob('./dist/*'), '--output-dir', './dist']; runpy.run_module('cibuildwheel');" env: CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_BUILD_VERBOSITY: 1 From f23e3d36d231dfd273ec4120203170066940f4f2 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 21:51:20 -0600 Subject: [PATCH 42/85] CI: Windows globbing fixes --- .github/workflows/python_build.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index f5e9a6b8..bcc0a3c5 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -46,6 +46,11 @@ jobs: with: name: sdist path: ./dist/ + - uses: tj-actions/glob@v20 + # FIXME? use a more programmatic or integrated solution here + id: sdist_glob + with: + files: ./dist/* # Used to host cibuildwheel - uses: actions/setup-python@v3 @@ -55,7 +60,7 @@ jobs: - name: Build wheels # https://github.com/pypa/cibuildwheel/issues/173#issuecomment-1501236916 - run: python -c "import glob,pathlib,runpy,sys; sys.argv[:] = ['cibuildwheel', *glob.glob('./dist/*'), '--output-dir', './dist']; runpy.run_module('cibuildwheel');" + run: python -m cibuildwheel ${{ steps.glob.outputs.paths }} --output-dir ./dist env: CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_BUILD_VERBOSITY: 1 @@ -70,4 +75,5 @@ jobs: with: name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} path: ./dist/*.whl + if-no-files-found: error From 45348b04f5c9d2e7da29eb1c190375e5b27ad2a6 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Tue, 30 Jan 2024 21:53:28 -0600 Subject: [PATCH 43/85] Fix typo --- .github/workflows/python_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index bcc0a3c5..8c2d126d 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -60,7 +60,7 @@ jobs: - name: Build wheels # https://github.com/pypa/cibuildwheel/issues/173#issuecomment-1501236916 - run: python -m cibuildwheel ${{ steps.glob.outputs.paths }} --output-dir ./dist + run: python -m cibuildwheel ${{ steps.sdist_glob.outputs.paths }} --output-dir ./dist env: CIBW_BUILD: ${{ matrix.cibw_build }} CIBW_BUILD_VERBOSITY: 1 From 7950283da68b7b0ebfb9100071c58ca49719b604 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 14:23:17 -0600 Subject: [PATCH 44/85] Don't depend on CFFI on PyPy --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 21a1ff57..cca359c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [project] name = "pypqc" -version = "0.0.6.2-rc1" +version = "0.0.6.2-rc3" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} dependencies = [ - 'cffi >= 1.0.0', + 'cffi >= 1.0.0;platform_python_implementation != "PyPy"', ] [tool.setuptools.packages.find] From 666a4643f908713f1a52fe66087af35cbd8f501c Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 14:31:21 -0600 Subject: [PATCH 45/85] CI: run on pull requests --- .github/workflows/python_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 8c2d126d..e345f15f 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -1,6 +1,6 @@ name: Build -on: [push] +on: [pull_request, push] jobs: sdist: From 028da0830c3baa8c6b80bd7ef25255fec073b0c2 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 14:35:56 -0600 Subject: [PATCH 46/85] swag https://cibuildwheel.readthedocs.io/en/stable/options/#archs --- .github/workflows/python_build.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index e345f15f..4bdfd844 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -63,6 +63,7 @@ jobs: run: python -m cibuildwheel ${{ steps.sdist_glob.outputs.paths }} --output-dir ./dist env: CIBW_BUILD: ${{ matrix.cibw_build }} + CIBW_ARCHS: all CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel disagrees with CPython 3.6 in some way # FIXME? PQClean GNU extensions break musl From 9588b52b765e617295d9e6dcdb05f090b02fcffe Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 14:47:39 -0600 Subject: [PATCH 47/85] Revert "CI: run on pull requests" This reverts commit 666a4643f908713f1a52fe66087af35cbd8f501c. It turned out to be a bit abusive, since Github doesn't deduplicate such actions; every time I pushed to a PR-eligible branch, it would build the whole project TWICE. That's just a waste of resources. We need to get some kind of test suite which is run before the sdist action is deemed a success, and run *that* on each commit; then we could consider not running the full bdist suite on every commit, but only on PRs? --- .github/workflows/python_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 4bdfd844..2d09a41a 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -1,6 +1,6 @@ name: Build -on: [pull_request, push] +on: [push] jobs: sdist: From c2190e69c09e8c17232441b9f7af457f742d6d8c Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 14:54:39 -0600 Subject: [PATCH 48/85] CI: more useful multi-platform errors --- .github/workflows/python_build.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 2d09a41a..450a3bad 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -27,6 +27,7 @@ jobs: matrix: py-impl: ["CPython 3.X", "PyPy 3.X"] os: [Windows, Mac, Linux] + fail-fast: false include: - py-impl: "CPython 3.X" From 64e65e0892025775f7e467e8eb55e8e09793832c Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 15:13:25 -0600 Subject: [PATCH 49/85] Break CIBW_SKIP into multiline To make git blame better --- .github/workflows/python_build.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 450a3bad..26aed4bd 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -68,7 +68,9 @@ jobs: CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel disagrees with CPython 3.6 in some way # FIXME? PQClean GNU extensions break musl - CIBW_SKIP: "cp36-* *-musllinux_*" + CIBW_SKIP: > + cp36-* + *-musllinux_* # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" From 3da16e223b1f0d65592c057a2619a67f362adfc7 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 15:13:50 -0600 Subject: [PATCH 50/85] CI: disable CPython Windows ARM --- .github/workflows/python_build.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 26aed4bd..7ffedb6e 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -68,9 +68,11 @@ jobs: CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel disagrees with CPython 3.6 in some way # FIXME? PQClean GNU extensions break musl + # FIXME? delvewheel chokes specifically on CPython on Windows on ARM CIBW_SKIP: > cp36-* *-musllinux_* + cp*-win*arm* # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" From 04b4d2a06ec1d3d01e40505033693df415af8713 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 15:19:04 -0600 Subject: [PATCH 51/85] Annoying YAML https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategyfail-fast --- .github/workflows/python_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 7ffedb6e..0bbb44bf 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -24,10 +24,10 @@ jobs: needs: sdist runs-on: ${{ matrix.github_os }} strategy: + fail-fast: false matrix: py-impl: ["CPython 3.X", "PyPy 3.X"] os: [Windows, Mac, Linux] - fail-fast: false include: - py-impl: "CPython 3.X" From cef5636f801af2266f26fc4b9db59b268c229101 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 15:57:51 -0600 Subject: [PATCH 52/85] Normalize code --- pqc/kem/hqc_128.py | 42 +++++++++++++-------------- pqc/kem/hqc_192.py | 42 +++++++++++++-------------- pqc/kem/hqc_256.py | 42 +++++++++++++-------------- pqc/kem/kyber1024.py | 42 +++++++++++++-------------- pqc/kem/kyber512.py | 42 +++++++++++++-------------- pqc/kem/kyber768.py | 42 +++++++++++++-------------- pqc/kem/mceliece348864.py | 42 +++++++++++++-------------- pqc/kem/mceliece460896.py | 42 +++++++++++++-------------- pqc/kem/mceliece6688128.py | 42 +++++++++++++-------------- pqc/kem/mceliece6960119.py | 42 +++++++++++++-------------- pqc/kem/mceliece8192128.py | 42 +++++++++++++-------------- pqc/sign/dilithium2.py | 32 ++++++++------------ pqc/sign/dilithium3.py | 32 ++++++++------------ pqc/sign/dilithium5.py | 32 ++++++++------------ pqc/sign/falcon_1024.py | 32 ++++++++------------ pqc/sign/falcon_512.py | 32 ++++++++------------ pqc/sign/sphincs_sha2_128f_simple.py | 32 ++++++++------------ pqc/sign/sphincs_sha2_128s_simple.py | 32 ++++++++------------ pqc/sign/sphincs_sha2_192f_simple.py | 32 ++++++++------------ pqc/sign/sphincs_sha2_192s_simple.py | 32 ++++++++------------ pqc/sign/sphincs_sha2_256f_simple.py | 32 ++++++++------------ pqc/sign/sphincs_sha2_256s_simple.py | 32 ++++++++------------ pqc/sign/sphincs_shake_128f_simple.py | 32 ++++++++------------ pqc/sign/sphincs_shake_128s_simple.py | 32 ++++++++------------ pqc/sign/sphincs_shake_192f_simple.py | 32 ++++++++------------ pqc/sign/sphincs_shake_192s_simple.py | 32 ++++++++------------ pqc/sign/sphincs_shake_256f_simple.py | 32 ++++++++------------ pqc/sign/sphincs_shake_256s_simple.py | 32 ++++++++------------ 28 files changed, 424 insertions(+), 582 deletions(-) diff --git a/pqc/kem/hqc_128.py b/pqc/kem/hqc_128.py index 4db2fda4..6fd59ce0 100644 --- a/pqc/kem/hqc_128.py +++ b/pqc/kem/hqc_128.py @@ -22,38 +22,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/hqc_192.py b/pqc/kem/hqc_192.py index 6df50ca6..35c747e0 100644 --- a/pqc/kem/hqc_192.py +++ b/pqc/kem/hqc_192.py @@ -22,38 +22,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/hqc_256.py b/pqc/kem/hqc_256.py index 5db868a8..12a72881 100644 --- a/pqc/kem/hqc_256.py +++ b/pqc/kem/hqc_256.py @@ -22,38 +22,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/kyber1024.py b/pqc/kem/kyber1024.py index be901ec5..aade152f 100644 --- a/pqc/kem/kyber1024.py +++ b/pqc/kem/kyber1024.py @@ -26,38 +26,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/kyber512.py b/pqc/kem/kyber512.py index e22f0cd1..a010cfbf 100644 --- a/pqc/kem/kyber512.py +++ b/pqc/kem/kyber512.py @@ -26,38 +26,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/kyber768.py b/pqc/kem/kyber768.py index 3b25e067..d3aa5f0a 100644 --- a/pqc/kem/kyber768.py +++ b/pqc/kem/kyber768.py @@ -26,38 +26,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/mceliece348864.py b/pqc/kem/mceliece348864.py index 28a38d81..8447814d 100644 --- a/pqc/kem/mceliece348864.py +++ b/pqc/kem/mceliece348864.py @@ -18,38 +18,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/mceliece460896.py b/pqc/kem/mceliece460896.py index d8742ee6..db7ec289 100644 --- a/pqc/kem/mceliece460896.py +++ b/pqc/kem/mceliece460896.py @@ -18,38 +18,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/mceliece6688128.py b/pqc/kem/mceliece6688128.py index 08bad48f..3da049b2 100644 --- a/pqc/kem/mceliece6688128.py +++ b/pqc/kem/mceliece6688128.py @@ -18,38 +18,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/mceliece6960119.py b/pqc/kem/mceliece6960119.py index d3024500..0f5e87d8 100644 --- a/pqc/kem/mceliece6960119.py +++ b/pqc/kem/mceliece6960119.py @@ -18,38 +18,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/kem/mceliece8192128.py b/pqc/kem/mceliece8192128.py index 602a54cc..489512ec 100644 --- a/pqc/kem/mceliece8192128.py +++ b/pqc/kem/mceliece8192128.py @@ -18,38 +18,36 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - if errno: - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") + errno = _crypto_kem_dec(_ss, _ct, _sk) - return bytes(key) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") diff --git a/pqc/sign/dilithium2.py b/pqc/sign/dilithium2.py index 1920d71a..7e3c0ccf 100644 --- a/pqc/sign/dilithium2.py +++ b/pqc/sign/dilithium2.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/dilithium3.py b/pqc/sign/dilithium3.py index 53bdf551..27f50039 100644 --- a/pqc/sign/dilithium3.py +++ b/pqc/sign/dilithium3.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/dilithium5.py b/pqc/sign/dilithium5.py index e9f7ee17..51e1cf87 100644 --- a/pqc/sign/dilithium5.py +++ b/pqc/sign/dilithium5.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/falcon_1024.py b/pqc/sign/falcon_1024.py index 61d957fc..90c5ced9 100644 --- a/pqc/sign/falcon_1024.py +++ b/pqc/sign/falcon_1024.py @@ -26,42 +26,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sigbuf = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sigbuf, _siglen, _m, len(m), _sk) - _sig = _sigbuf[0:_siglen[0]] # Variable-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + _sig = _sigbuf[0:_siglen[0]] # Variable-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.from_buffer(sig) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/falcon_512.py b/pqc/sign/falcon_512.py index 8728b47a..25ef28f3 100644 --- a/pqc/sign/falcon_512.py +++ b/pqc/sign/falcon_512.py @@ -26,42 +26,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sigbuf = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sigbuf, _siglen, _m, len(m), _sk) - _sig = _sigbuf[0:_siglen[0]] # Variable-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + _sig = _sigbuf[0:_siglen[0]] # Variable-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.from_buffer(sig) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_sha2_128f_simple.py b/pqc/sign/sphincs_sha2_128f_simple.py index 100743d5..d09fbb84 100644 --- a/pqc/sign/sphincs_sha2_128f_simple.py +++ b/pqc/sign/sphincs_sha2_128f_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_sha2_128s_simple.py b/pqc/sign/sphincs_sha2_128s_simple.py index a8f8e3ad..f0547547 100644 --- a/pqc/sign/sphincs_sha2_128s_simple.py +++ b/pqc/sign/sphincs_sha2_128s_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_sha2_192f_simple.py b/pqc/sign/sphincs_sha2_192f_simple.py index 2e08207f..3ffc4705 100644 --- a/pqc/sign/sphincs_sha2_192f_simple.py +++ b/pqc/sign/sphincs_sha2_192f_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_sha2_192s_simple.py b/pqc/sign/sphincs_sha2_192s_simple.py index 9c883781..10df2c44 100644 --- a/pqc/sign/sphincs_sha2_192s_simple.py +++ b/pqc/sign/sphincs_sha2_192s_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_sha2_256f_simple.py b/pqc/sign/sphincs_sha2_256f_simple.py index 5b74fe15..ca87462b 100644 --- a/pqc/sign/sphincs_sha2_256f_simple.py +++ b/pqc/sign/sphincs_sha2_256f_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_sha2_256s_simple.py b/pqc/sign/sphincs_sha2_256s_simple.py index fa17fae4..0111162f 100644 --- a/pqc/sign/sphincs_sha2_256s_simple.py +++ b/pqc/sign/sphincs_sha2_256s_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_shake_128f_simple.py b/pqc/sign/sphincs_shake_128f_simple.py index 2a4f7fbc..8189dea4 100644 --- a/pqc/sign/sphincs_shake_128f_simple.py +++ b/pqc/sign/sphincs_shake_128f_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_shake_128s_simple.py b/pqc/sign/sphincs_shake_128s_simple.py index 4487688e..95831ff9 100644 --- a/pqc/sign/sphincs_shake_128s_simple.py +++ b/pqc/sign/sphincs_shake_128s_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_shake_192f_simple.py b/pqc/sign/sphincs_shake_192f_simple.py index a53be3dc..13879de9 100644 --- a/pqc/sign/sphincs_shake_192f_simple.py +++ b/pqc/sign/sphincs_shake_192f_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_shake_192s_simple.py b/pqc/sign/sphincs_shake_192s_simple.py index f7762d78..749ae732 100644 --- a/pqc/sign/sphincs_shake_192s_simple.py +++ b/pqc/sign/sphincs_shake_192s_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_shake_256f_simple.py b/pqc/sign/sphincs_shake_256f_simple.py index 37e07f23..daee74e7 100644 --- a/pqc/sign/sphincs_shake_256f_simple.py +++ b/pqc/sign/sphincs_shake_256f_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") diff --git a/pqc/sign/sphincs_shake_256s_simple.py b/pqc/sign/sphincs_shake_256s_simple.py index 68f39201..56482892 100644 --- a/pqc/sign/sphincs_shake_256s_simple.py +++ b/pqc/sign/sphincs_shake_256s_simple.py @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # Fixed-length signature - if errno: - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") From 4c618f6187503ba55e2bc77aea06b0d5eaa39670 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 16:01:40 -0600 Subject: [PATCH 53/85] Ruff: linter --- pqc/_util.py | 3 --- pqc/kem/mceliece348864.py | 2 +- pqc/kem/mceliece460896.py | 2 +- pqc/kem/mceliece6688128.py | 2 +- pqc/kem/mceliece6960119.py | 2 +- pqc/kem/mceliece8192128.py | 2 +- 6 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pqc/_util.py b/pqc/_util.py index 8bc87a9b..1360b342 100644 --- a/pqc/_util.py +++ b/pqc/_util.py @@ -1,8 +1,5 @@ from collections import deque -from functools import partial from itertools import starmap -from pathlib import Path -import platform import re from textwrap import dedent from warnings import warn diff --git a/pqc/kem/mceliece348864.py b/pqc/kem/mceliece348864.py index 8447814d..75c65685 100644 --- a/pqc/kem/mceliece348864.py +++ b/pqc/kem/mceliece348864.py @@ -1,6 +1,6 @@ from .._lib.libmceliece348864f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' diff --git a/pqc/kem/mceliece460896.py b/pqc/kem/mceliece460896.py index db7ec289..7d54ea49 100644 --- a/pqc/kem/mceliece460896.py +++ b/pqc/kem/mceliece460896.py @@ -1,6 +1,6 @@ from .._lib.libmceliece460896f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' diff --git a/pqc/kem/mceliece6688128.py b/pqc/kem/mceliece6688128.py index 3da049b2..543c6042 100644 --- a/pqc/kem/mceliece6688128.py +++ b/pqc/kem/mceliece6688128.py @@ -1,6 +1,6 @@ from .._lib.libmceliece6688128f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' diff --git a/pqc/kem/mceliece6960119.py b/pqc/kem/mceliece6960119.py index 0f5e87d8..521f31e4 100644 --- a/pqc/kem/mceliece6960119.py +++ b/pqc/kem/mceliece6960119.py @@ -1,6 +1,6 @@ from .._lib.libmceliece6960119f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' diff --git a/pqc/kem/mceliece8192128.py b/pqc/kem/mceliece8192128.py index 489512ec..1d122954 100644 --- a/pqc/kem/mceliece8192128.py +++ b/pqc/kem/mceliece8192128.py @@ -1,6 +1,6 @@ from .._lib.libmceliece8192128f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' From 5b9de8f244ad788a4555c8d3cf633689edac412f Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 16:17:19 -0600 Subject: [PATCH 54/85] Ruff: format --- pqc/demo.py | 4 +- pqc/kem/hqc_128.py | 10 ++- pqc/kem/hqc_192.py | 10 ++- pqc/kem/hqc_256.py | 10 ++- pqc/kem/kyber1024.py | 10 ++- pqc/kem/kyber512.py | 10 ++- pqc/kem/kyber768.py | 10 ++- pqc/kem/mceliece348864.py | 7 +- pqc/kem/mceliece460896.py | 7 +- pqc/kem/mceliece6688128.py | 7 +- pqc/kem/mceliece6960119.py | 7 +- pqc/kem/mceliece8192128.py | 7 +- pqc/sign/dilithium2.py | 6 +- pqc/sign/dilithium3.py | 6 +- pqc/sign/dilithium5.py | 6 +- pqc/sign/falcon_1024.py | 11 ++- pqc/sign/falcon_512.py | 11 ++- pqc/sign/sphincs_sha2_128f_simple.py | 6 +- pqc/sign/sphincs_sha2_128s_simple.py | 6 +- pqc/sign/sphincs_sha2_192f_simple.py | 6 +- pqc/sign/sphincs_sha2_192s_simple.py | 6 +- pqc/sign/sphincs_sha2_256f_simple.py | 6 +- pqc/sign/sphincs_sha2_256s_simple.py | 6 +- pqc/sign/sphincs_shake_128f_simple.py | 6 +- pqc/sign/sphincs_shake_128s_simple.py | 6 +- pqc/sign/sphincs_shake_192f_simple.py | 6 +- pqc/sign/sphincs_shake_192s_simple.py | 6 +- pqc/sign/sphincs_shake_256f_simple.py | 6 +- pqc/sign/sphincs_shake_256s_simple.py | 6 +- pyproject.toml | 4 + setup.py | 121 +++++++++++++++----------- 31 files changed, 184 insertions(+), 152 deletions(-) diff --git a/pqc/demo.py b/pqc/demo.py index edefb311..59bff24a 100644 --- a/pqc/demo.py +++ b/pqc/demo.py @@ -6,5 +6,5 @@ test_decrypted = mceliece6960119.kem_dec(test_ciphertext, secret_key) if test_key != test_decrypted: - raise AssertionError("fail :(") - print("OK") + raise AssertionError('fail :(') + print('OK') diff --git a/pqc/kem/hqc_128.py b/pqc/kem/hqc_128.py index 6fd59ce0..b000e370 100644 --- a/pqc/kem/hqc_128.py +++ b/pqc/kem/hqc_128.py @@ -1,12 +1,15 @@ from .._lib.libhqc_128_clean import ffi, lib import os + if os.environ.get('LICENSED_HQC', '0') == '0': + # fmt: off from .._util import patent_notice patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], 'the HQC cryptosystem', 3, ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] ) + # fmt: on __all__ = ['keypair', 'encap', 'decap'] @@ -29,7 +32,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -41,7 +44,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -53,5 +56,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/hqc_192.py b/pqc/kem/hqc_192.py index 35c747e0..dda202f8 100644 --- a/pqc/kem/hqc_192.py +++ b/pqc/kem/hqc_192.py @@ -1,12 +1,15 @@ from .._lib.libhqc_192_clean import ffi, lib import os + if os.environ.get('LICENSED_HQC', '0') == '0': + # fmt: off from .._util import patent_notice patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], 'the HQC cryptosystem', 3, ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] ) + # fmt: on __all__ = ['keypair', 'encap', 'decap'] @@ -29,7 +32,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -41,7 +44,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -53,5 +56,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/hqc_256.py b/pqc/kem/hqc_256.py index 12a72881..578a6965 100644 --- a/pqc/kem/hqc_256.py +++ b/pqc/kem/hqc_256.py @@ -1,12 +1,15 @@ from .._lib.libhqc_256_clean import ffi, lib import os + if os.environ.get('LICENSED_HQC', '0') == '0': + # fmt: off from .._util import patent_notice patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], 'the HQC cryptosystem', 3, ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] ) + # fmt: on __all__ = ['keypair', 'encap', 'decap'] @@ -29,7 +32,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -41,7 +44,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -53,5 +56,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/kyber1024.py b/pqc/kem/kyber1024.py index aade152f..e37a8d59 100644 --- a/pqc/kem/kyber1024.py +++ b/pqc/kem/kyber1024.py @@ -1,7 +1,9 @@ from .._lib.libkyber1024_clean import ffi, lib import os + if os.environ.get('LICENSED_KYBER', '0') == '0': + # fmt: off from .._util import patent_notice patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], 'the Kyber cryptosystem', 1, [ @@ -11,6 +13,7 @@ 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] ) + # fmt: on __all__ = ['keypair', 'encap', 'decap'] @@ -33,7 +36,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -45,7 +48,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -57,5 +60,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/kyber512.py b/pqc/kem/kyber512.py index a010cfbf..8103ae69 100644 --- a/pqc/kem/kyber512.py +++ b/pqc/kem/kyber512.py @@ -1,7 +1,9 @@ from .._lib.libkyber512_clean import ffi, lib import os + if os.environ.get('LICENSED_KYBER', '0') == '0': + # fmt: off from .._util import patent_notice patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], 'the Kyber cryptosystem', 1, [ @@ -11,6 +13,7 @@ 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] ) + # fmt: on __all__ = ['keypair', 'encap', 'decap'] @@ -33,7 +36,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -45,7 +48,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -57,5 +60,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/kyber768.py b/pqc/kem/kyber768.py index d3aa5f0a..50da301b 100644 --- a/pqc/kem/kyber768.py +++ b/pqc/kem/kyber768.py @@ -1,7 +1,9 @@ from .._lib.libkyber768_clean import ffi, lib import os + if os.environ.get('LICENSED_KYBER', '0') == '0': + # fmt: off from .._util import patent_notice patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], 'the Kyber cryptosystem', 1, [ @@ -11,6 +13,7 @@ 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] ) + # fmt: on __all__ = ['keypair', 'encap', 'decap'] @@ -33,7 +36,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -45,7 +48,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -57,5 +60,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece348864.py b/pqc/kem/mceliece348864.py index 75c65685..fccf2e0f 100644 --- a/pqc/kem/mceliece348864.py +++ b/pqc/kem/mceliece348864.py @@ -25,7 +25,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -37,7 +37,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -49,5 +49,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece460896.py b/pqc/kem/mceliece460896.py index 7d54ea49..2c688617 100644 --- a/pqc/kem/mceliece460896.py +++ b/pqc/kem/mceliece460896.py @@ -25,7 +25,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -37,7 +37,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -49,5 +49,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece6688128.py b/pqc/kem/mceliece6688128.py index 543c6042..261e6c9c 100644 --- a/pqc/kem/mceliece6688128.py +++ b/pqc/kem/mceliece6688128.py @@ -25,7 +25,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -37,7 +37,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -49,5 +49,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece6960119.py b/pqc/kem/mceliece6960119.py index 521f31e4..62bcac43 100644 --- a/pqc/kem/mceliece6960119.py +++ b/pqc/kem/mceliece6960119.py @@ -25,7 +25,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -37,7 +37,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -49,5 +49,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece8192128.py b/pqc/kem/mceliece8192128.py index 1d122954..edf078df 100644 --- a/pqc/kem/mceliece8192128.py +++ b/pqc/kem/mceliece8192128.py @@ -25,7 +25,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_kem_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): @@ -37,7 +37,7 @@ def encap(pk): if errno == 0: return bytes(_ss), bytes(_ct) - raise RuntimeError(f"{_crypto_kem_enc.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): @@ -49,5 +49,4 @@ def decap(ciphertext, sk): if errno == 0: return bytes(_ss) - raise RuntimeError(f"{_crypto_kem_dec.__name__} returned error code {errno}") - + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/sign/dilithium2.py b/pqc/sign/dilithium2.py index 7e3c0ccf..b3a43433 100644 --- a/pqc/sign/dilithium2.py +++ b/pqc/sign/dilithium2.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/dilithium3.py b/pqc/sign/dilithium3.py index 27f50039..1ee2044a 100644 --- a/pqc/sign/dilithium3.py +++ b/pqc/sign/dilithium3.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/dilithium5.py b/pqc/sign/dilithium5.py index 51e1cf87..c90a6948 100644 --- a/pqc/sign/dilithium5.py +++ b/pqc/sign/dilithium5.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/falcon_1024.py b/pqc/sign/falcon_1024.py index 90c5ced9..c98ef67a 100644 --- a/pqc/sign/falcon_1024.py +++ b/pqc/sign/falcon_1024.py @@ -1,12 +1,15 @@ from .._lib.libfalcon_1024_clean import ffi, lib import os + if os.environ.get('LICENSED_FALCON', '0') == '0': + # fmt: off from .._util import patent_notice patent_notice(['US7308097B2'], 'the Falcon cryptosystem', 2, ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] ) + # fmt: on __all__ = ['keypair', 'sign', 'verify'] @@ -28,7 +31,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -40,9 +43,9 @@ def sign(m, sk): errno = _crypto_sign_signature(_sigbuf, _siglen, _m, len(m), _sk) if errno == 0: - _sig = _sigbuf[0:_siglen[0]] # Variable-length signature + _sig = _sigbuf[0 : _siglen[0]] # Variable-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -56,4 +59,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/falcon_512.py b/pqc/sign/falcon_512.py index 25ef28f3..c658e9a6 100644 --- a/pqc/sign/falcon_512.py +++ b/pqc/sign/falcon_512.py @@ -1,12 +1,15 @@ from .._lib.libfalcon_512_clean import ffi, lib import os + if os.environ.get('LICENSED_FALCON', '0') == '0': + # fmt: off from .._util import patent_notice patent_notice(['US7308097B2'], 'the Falcon cryptosystem', 2, ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] ) + # fmt: on __all__ = ['keypair', 'sign', 'verify'] @@ -28,7 +31,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -40,9 +43,9 @@ def sign(m, sk): errno = _crypto_sign_signature(_sigbuf, _siglen, _m, len(m), _sk) if errno == 0: - _sig = _sigbuf[0:_siglen[0]] # Variable-length signature + _sig = _sigbuf[0 : _siglen[0]] # Variable-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -56,4 +59,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_128f_simple.py b/pqc/sign/sphincs_sha2_128f_simple.py index d09fbb84..0a1c4b0c 100644 --- a/pqc/sign/sphincs_sha2_128f_simple.py +++ b/pqc/sign/sphincs_sha2_128f_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_128s_simple.py b/pqc/sign/sphincs_sha2_128s_simple.py index f0547547..b1b36ae5 100644 --- a/pqc/sign/sphincs_sha2_128s_simple.py +++ b/pqc/sign/sphincs_sha2_128s_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_192f_simple.py b/pqc/sign/sphincs_sha2_192f_simple.py index 3ffc4705..0ff057d3 100644 --- a/pqc/sign/sphincs_sha2_192f_simple.py +++ b/pqc/sign/sphincs_sha2_192f_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_192s_simple.py b/pqc/sign/sphincs_sha2_192s_simple.py index 10df2c44..8040aa55 100644 --- a/pqc/sign/sphincs_sha2_192s_simple.py +++ b/pqc/sign/sphincs_sha2_192s_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_256f_simple.py b/pqc/sign/sphincs_sha2_256f_simple.py index ca87462b..69aeb17e 100644 --- a/pqc/sign/sphincs_sha2_256f_simple.py +++ b/pqc/sign/sphincs_sha2_256f_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_256s_simple.py b/pqc/sign/sphincs_sha2_256s_simple.py index 0111162f..b4ce53bb 100644 --- a/pqc/sign/sphincs_sha2_256s_simple.py +++ b/pqc/sign/sphincs_sha2_256s_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_128f_simple.py b/pqc/sign/sphincs_shake_128f_simple.py index 8189dea4..da90937d 100644 --- a/pqc/sign/sphincs_shake_128f_simple.py +++ b/pqc/sign/sphincs_shake_128f_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_128s_simple.py b/pqc/sign/sphincs_shake_128s_simple.py index 95831ff9..8d2b726d 100644 --- a/pqc/sign/sphincs_shake_128s_simple.py +++ b/pqc/sign/sphincs_shake_128s_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_192f_simple.py b/pqc/sign/sphincs_shake_192f_simple.py index 13879de9..39287aff 100644 --- a/pqc/sign/sphincs_shake_192f_simple.py +++ b/pqc/sign/sphincs_shake_192f_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_192s_simple.py b/pqc/sign/sphincs_shake_192s_simple.py index 749ae732..6c70b8c3 100644 --- a/pqc/sign/sphincs_shake_192s_simple.py +++ b/pqc/sign/sphincs_shake_192s_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_256f_simple.py b/pqc/sign/sphincs_shake_256f_simple.py index daee74e7..368e5c00 100644 --- a/pqc/sign/sphincs_shake_256f_simple.py +++ b/pqc/sign/sphincs_shake_256f_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_256s_simple.py b/pqc/sign/sphincs_shake_256s_simple.py index 56482892..16c31336 100644 --- a/pqc/sign/sphincs_shake_256s_simple.py +++ b/pqc/sign/sphincs_shake_256s_simple.py @@ -20,7 +20,7 @@ def keypair(): if errno == 0: return bytes(_pk), bytes(_sk) - raise RuntimeError(f"{_crypto_sign_keypair.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): @@ -34,7 +34,7 @@ def sign(m, sk): if errno == 0: assert len(_sig) == _siglen[0] # Fixed-length signature return bytes(_sig) - raise RuntimeError(f"{_crypto_sign_signature.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): @@ -48,4 +48,4 @@ def verify(sig, m, pk): return if errno == -1: raise ValueError('verification failed') - raise RuntimeError(f"{_crypto_sign_verify.__name__} returned error code {errno}") + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pyproject.toml b/pyproject.toml index cca359c8..baad2edc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,3 +18,7 @@ requires = [ 'setuptools >= 49.5.0', 'wheel >= 0.30.0', ] + +[tool.ruff.format] +quote-style = "single" +indent-style = "tab" diff --git a/setup.py b/setup.py index 79841898..fbc8b8d9 100644 --- a/setup.py +++ b/setup.py @@ -8,66 +8,81 @@ # Pending https://hpyproject.org/ ABI3_EXCLUDE_IMPLEMENTATIONS = { - 'PyPy', # https://github.com/orgs/pypy/discussions/4884#discussioncomment-8309845 + 'PyPy', # https://github.com/orgs/pypy/discussions/4884#discussioncomment-8309845 } class site_bdist_wheel(_bdist_wheel): - """https://github.com/joerick/python-ctypes-package-sample/blob/7db688cd6ee32ae95bce0f75fb7d806926e20252/setup.py#L29""" + """https://github.com/joerick/python-ctypes-package-sample/blob/7db688cd6ee32ae95bce0f75fb7d806926e20252/setup.py#L29""" - def finalize_options(self): - # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L244 - if (platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS - # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L267 - and (self.distribution.has_ext_modules() or self.distribution.has_c_libraries()) - # https://github.com/pypa/setuptools/blob/v69.0.3/setuptools/command/build_ext.py#L160 - and all(ext.py_limited_api for ext in self.distribution.ext_modules) - ): - self.py_limited_api = f'cp{sys.version_info.major}{sys.version_info.minor}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}{sys.version_info.minor}' - super().finalize_options() + def finalize_options(self): + # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L244 + if ( + platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS + # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L267 + and ( + self.distribution.has_ext_modules() + or self.distribution.has_c_libraries() + ) + # https://github.com/pypa/setuptools/blob/v69.0.3/setuptools/command/build_ext.py#L160 + and all(ext.py_limited_api for ext in self.distribution.ext_modules) + ): + self.py_limited_api = ( + f'cp{sys.version_info.major}{sys.version_info.minor}' + if platform.python_implementation() == 'CPython' + else f'py{sys.version_info.major}{sys.version_info.minor}' + ) + super().finalize_options() - def get_tag(self): - python, abi, plat = _bdist_wheel.get_tag(self) - if self.py_limited_api and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS: - # https://github.com/pypa/cibuildwheel/blob/v2.16.3/cibuildwheel/util.py#L653 - python = f'cp{sys.version_info.major}{sys.version_info.minor}' if platform.python_implementation() == 'CPython' else f'py{sys.version_info.major}{sys.version_info.minor}' - abi = f'abi{sys.version_info.major}' - return python, abi, plat + def get_tag(self): + python, abi, plat = _bdist_wheel.get_tag(self) + if ( + self.py_limited_api + and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS + ): + # https://github.com/pypa/cibuildwheel/blob/v2.16.3/cibuildwheel/util.py#L653 + python = ( + f'cp{sys.version_info.major}{sys.version_info.minor}' + if platform.python_implementation() == 'CPython' + else f'py{sys.version_info.major}{sys.version_info.minor}' + ) + abi = f'abi{sys.version_info.major}' + return python, abi, plat setup( - cmdclass={"bdist_wheel": site_bdist_wheel}, - cffi_modules=[ - 'cffi_modules/dilithium2_clean.py:ffi', - 'cffi_modules/dilithium3_clean.py:ffi', - 'cffi_modules/dilithium5_clean.py:ffi', - 'cffi_modules/falcon_512_clean.py:ffi', - 'cffi_modules/falcon_1024_clean.py:ffi', - 'cffi_modules/hqc_128_clean.py:ffi', - 'cffi_modules/hqc_192_clean.py:ffi', - 'cffi_modules/hqc_256_clean.py:ffi', - 'cffi_modules/kyber512_clean.py:ffi', - 'cffi_modules/kyber768_clean.py:ffi', - 'cffi_modules/kyber1024_clean.py:ffi', - 'cffi_modules/mceliece348864f_clean.py:ffi', - 'cffi_modules/mceliece460896f_clean.py:ffi', - 'cffi_modules/mceliece6688128f_clean.py:ffi', - 'cffi_modules/mceliece6960119f_clean.py:ffi', - 'cffi_modules/mceliece8192128f_clean.py:ffi', -# 'cffi_modules/mceliece6688128pcf_clean.py:ffi', -# 'cffi_modules/mceliece6960119pcf_clean.py:ffi', -# 'cffi_modules/mceliece8192128pcf_clean.py:ffi', - 'cffi_modules/sphincs-sha2-128f-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-128s-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-192f-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-192s-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-256f-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-256s-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-128f-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-128s-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-192f-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-192s-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-256f-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-256s-simple_clean.py:ffi', - ], + cmdclass={'bdist_wheel': site_bdist_wheel}, + cffi_modules=[ + 'cffi_modules/dilithium2_clean.py:ffi', + 'cffi_modules/dilithium3_clean.py:ffi', + 'cffi_modules/dilithium5_clean.py:ffi', + 'cffi_modules/falcon_512_clean.py:ffi', + 'cffi_modules/falcon_1024_clean.py:ffi', + 'cffi_modules/hqc_128_clean.py:ffi', + 'cffi_modules/hqc_192_clean.py:ffi', + 'cffi_modules/hqc_256_clean.py:ffi', + 'cffi_modules/kyber512_clean.py:ffi', + 'cffi_modules/kyber768_clean.py:ffi', + 'cffi_modules/kyber1024_clean.py:ffi', + 'cffi_modules/mceliece348864f_clean.py:ffi', + 'cffi_modules/mceliece460896f_clean.py:ffi', + 'cffi_modules/mceliece6688128f_clean.py:ffi', + 'cffi_modules/mceliece6960119f_clean.py:ffi', + 'cffi_modules/mceliece8192128f_clean.py:ffi', + ##'cffi_modules/mceliece6688128pcf_clean.py:ffi', + ##'cffi_modules/mceliece6960119pcf_clean.py:ffi', + ##'cffi_modules/mceliece8192128pcf_clean.py:ffi', + 'cffi_modules/sphincs-sha2-128f-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-128s-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-192f-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-192s-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-256f-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-256s-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-128f-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-128s-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-192f-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-192s-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-256f-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-256s-simple_clean.py:ffi', + ], ) From f492550c9962bd8066d8988cd82c96d1fa3d076b Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 16:26:55 -0600 Subject: [PATCH 55/85] CI: rebalance build weird lopsided build takes an HOUR as previously configured --- .github/workflows/python_build.yaml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 0bbb44bf..02cb94ca 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -27,7 +27,7 @@ jobs: fail-fast: false matrix: py-impl: ["CPython 3.X", "PyPy 3.X"] - os: [Windows, Mac, Linux] + os: [Windows, Mac, Linux, "Linux (Extra Architectures)"] include: - py-impl: "CPython 3.X" @@ -37,10 +37,16 @@ jobs: # Use the oldest OSes available for compatibility - os: Windows github_os: windows-2019 + cibw_archs: AMD64 x86 ARM64 - os: Mac github_os: macos-11 + cibw_archs: x86_64 arm64 universal2 - os: Linux github_os: ubuntu-20.04 + cibw_archs: x86_64 i686 aarch64 + - os: Linux (Extra Architectures) + github_os: ubuntu-20.04 + cibw_archs: ppc64le s390x steps: - uses: actions/download-artifact@v4 @@ -64,7 +70,7 @@ jobs: run: python -m cibuildwheel ${{ steps.sdist_glob.outputs.paths }} --output-dir ./dist env: CIBW_BUILD: ${{ matrix.cibw_build }} - CIBW_ARCHS: all + CIBW_ARCHS: ${{ matrix.cibw_archs }} CIBW_BUILD_VERBOSITY: 1 # FIXME? cibuildwheel disagrees with CPython 3.6 in some way # FIXME? PQClean GNU extensions break musl From 8c779a5c6603ba196a7a159110db464d1574e008 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 16:30:48 -0600 Subject: [PATCH 56/85] CI: exclude PyPy from uncommon architectures --- .github/workflows/python_build.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index 02cb94ca..d6d4008d 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -48,6 +48,11 @@ jobs: github_os: ubuntu-20.04 cibw_archs: ppc64le s390x + exclude: + - os: Linux (Extra Architectures) + # FIXME? cibuildwheel disagrees with this + py-impl: "PyPy 3.X" + steps: - uses: actions/download-artifact@v4 with: From 6f93f870104c88e923cd9399b0fa9c3decdf8e8f Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 31 Jan 2024 16:58:25 -0600 Subject: [PATCH 57/85] CI: fix Linux extra architectures --- .github/workflows/python_build.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml index d6d4008d..6d9af4a3 100644 --- a/.github/workflows/python_build.yaml +++ b/.github/workflows/python_build.yaml @@ -43,10 +43,10 @@ jobs: cibw_archs: x86_64 arm64 universal2 - os: Linux github_os: ubuntu-20.04 - cibw_archs: x86_64 i686 aarch64 + cibw_archs: x86_64 i686 - os: Linux (Extra Architectures) github_os: ubuntu-20.04 - cibw_archs: ppc64le s390x + cibw_archs: aarch64 ppc64le s390x exclude: - os: Linux (Extra Architectures) @@ -67,6 +67,9 @@ jobs: # Used to host cibuildwheel - uses: actions/setup-python@v3 + - if: matrix.os == 'Linux (Extra Architectures)' + uses: docker/setup-qemu-action@v3 + - name: Install cibuildwheel run: python -m pip install cibuildwheel==2.16.3 From f68c35545b435da68a689f70add2927de6479b86 Mon Sep 17 00:00:00 2001 From: Thom Wiggers Date: Mon, 5 Feb 2024 08:07:54 +0100 Subject: [PATCH 58/85] Apply Astyle 3.4.11 to signatures (#535) * Apply Astyle 3.4.10 to signatures * Apply AStyle 3.4.11 * Block another astyle version * Help debug astyle issues --- .../sphincs-sha2-128f-simple/avx2/hash_sha2.c | 4 +-- .../sphincs-sha2-128f-simple/avx2/sha256x8.c | 2 +- .../clean/hash_sha2.c | 4 +-- .../sphincs-sha2-128s-simple/avx2/hash_sha2.c | 4 +-- .../sphincs-sha2-128s-simple/avx2/sha256x8.c | 2 +- .../clean/hash_sha2.c | 4 +-- .../sphincs-sha2-192f-simple/avx2/hash_sha2.c | 4 +-- .../sphincs-sha2-192f-simple/avx2/sha256x8.c | 2 +- .../clean/hash_sha2.c | 4 +-- .../sphincs-sha2-192s-simple/avx2/hash_sha2.c | 4 +-- .../sphincs-sha2-192s-simple/avx2/sha256x8.c | 2 +- .../clean/hash_sha2.c | 4 +-- .../sphincs-sha2-256f-simple/avx2/hash_sha2.c | 4 +-- .../sphincs-sha2-256f-simple/avx2/sha256x8.c | 2 +- .../clean/hash_sha2.c | 4 +-- .../sphincs-sha2-256s-simple/avx2/hash_sha2.c | 4 +-- .../sphincs-sha2-256s-simple/avx2/sha256x8.c | 2 +- .../clean/hash_sha2.c | 4 +-- .../avx2/thash_shake_simplex4.c | 6 ++--- .../avx2/thash_shake_simplex4.c | 6 ++--- .../avx2/thash_shake_simplex4.c | 6 ++--- .../avx2/thash_shake_simplex4.c | 6 ++--- .../avx2/thash_shake_simplex4.c | 6 ++--- .../avx2/thash_shake_simplex4.c | 6 ++--- test/test_format.py | 25 ++++++++++++++++--- 25 files changed, 70 insertions(+), 51 deletions(-) diff --git a/crypto_sign/sphincs-sha2-128f-simple/avx2/hash_sha2.c b/crypto_sign/sphincs-sha2-128f-simple/avx2/hash_sha2.c index 32975338..a03540d3 100644 --- a/crypto_sign/sphincs-sha2-128f-simple/avx2/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-128f-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-128f-simple/avx2/sha256x8.c b/crypto_sign/sphincs-sha2-128f-simple/avx2/sha256x8.c index d97750c0..d2afbb0c 100644 --- a/crypto_sign/sphincs-sha2-128f-simple/avx2/sha256x8.c +++ b/crypto_sign/sphincs-sha2-128f-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/crypto_sign/sphincs-sha2-128f-simple/clean/hash_sha2.c b/crypto_sign/sphincs-sha2-128f-simple/clean/hash_sha2.c index 32975338..a03540d3 100644 --- a/crypto_sign/sphincs-sha2-128f-simple/clean/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-128f-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-128s-simple/avx2/hash_sha2.c b/crypto_sign/sphincs-sha2-128s-simple/avx2/hash_sha2.c index 32975338..a03540d3 100644 --- a/crypto_sign/sphincs-sha2-128s-simple/avx2/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-128s-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-128s-simple/avx2/sha256x8.c b/crypto_sign/sphincs-sha2-128s-simple/avx2/sha256x8.c index d97750c0..d2afbb0c 100644 --- a/crypto_sign/sphincs-sha2-128s-simple/avx2/sha256x8.c +++ b/crypto_sign/sphincs-sha2-128s-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/crypto_sign/sphincs-sha2-128s-simple/clean/hash_sha2.c b/crypto_sign/sphincs-sha2-128s-simple/clean/hash_sha2.c index 32975338..a03540d3 100644 --- a/crypto_sign/sphincs-sha2-128s-simple/clean/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-128s-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-192f-simple/avx2/hash_sha2.c b/crypto_sign/sphincs-sha2-192f-simple/avx2/hash_sha2.c index 5ba5e9cf..828558f0 100644 --- a/crypto_sign/sphincs-sha2-192f-simple/avx2/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-192f-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-192f-simple/avx2/sha256x8.c b/crypto_sign/sphincs-sha2-192f-simple/avx2/sha256x8.c index d97750c0..d2afbb0c 100644 --- a/crypto_sign/sphincs-sha2-192f-simple/avx2/sha256x8.c +++ b/crypto_sign/sphincs-sha2-192f-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/crypto_sign/sphincs-sha2-192f-simple/clean/hash_sha2.c b/crypto_sign/sphincs-sha2-192f-simple/clean/hash_sha2.c index 5ba5e9cf..828558f0 100644 --- a/crypto_sign/sphincs-sha2-192f-simple/clean/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-192f-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-192s-simple/avx2/hash_sha2.c b/crypto_sign/sphincs-sha2-192s-simple/avx2/hash_sha2.c index 5ba5e9cf..828558f0 100644 --- a/crypto_sign/sphincs-sha2-192s-simple/avx2/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-192s-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-192s-simple/avx2/sha256x8.c b/crypto_sign/sphincs-sha2-192s-simple/avx2/sha256x8.c index d97750c0..d2afbb0c 100644 --- a/crypto_sign/sphincs-sha2-192s-simple/avx2/sha256x8.c +++ b/crypto_sign/sphincs-sha2-192s-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/crypto_sign/sphincs-sha2-192s-simple/clean/hash_sha2.c b/crypto_sign/sphincs-sha2-192s-simple/clean/hash_sha2.c index 5ba5e9cf..828558f0 100644 --- a/crypto_sign/sphincs-sha2-192s-simple/clean/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-192s-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-256f-simple/avx2/hash_sha2.c b/crypto_sign/sphincs-sha2-256f-simple/avx2/hash_sha2.c index 5ba5e9cf..828558f0 100644 --- a/crypto_sign/sphincs-sha2-256f-simple/avx2/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-256f-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-256f-simple/avx2/sha256x8.c b/crypto_sign/sphincs-sha2-256f-simple/avx2/sha256x8.c index d97750c0..d2afbb0c 100644 --- a/crypto_sign/sphincs-sha2-256f-simple/avx2/sha256x8.c +++ b/crypto_sign/sphincs-sha2-256f-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/crypto_sign/sphincs-sha2-256f-simple/clean/hash_sha2.c b/crypto_sign/sphincs-sha2-256f-simple/clean/hash_sha2.c index 5ba5e9cf..828558f0 100644 --- a/crypto_sign/sphincs-sha2-256f-simple/clean/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-256f-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-256s-simple/avx2/hash_sha2.c b/crypto_sign/sphincs-sha2-256s-simple/avx2/hash_sha2.c index 5ba5e9cf..828558f0 100644 --- a/crypto_sign/sphincs-sha2-256s-simple/avx2/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-256s-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-sha2-256s-simple/avx2/sha256x8.c b/crypto_sign/sphincs-sha2-256s-simple/avx2/sha256x8.c index d97750c0..d2afbb0c 100644 --- a/crypto_sign/sphincs-sha2-256s-simple/avx2/sha256x8.c +++ b/crypto_sign/sphincs-sha2-256s-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/crypto_sign/sphincs-sha2-256s-simple/clean/hash_sha2.c b/crypto_sign/sphincs-sha2-256s-simple/clean/hash_sha2.c index 5ba5e9cf..828558f0 100644 --- a/crypto_sign/sphincs-sha2-256s-simple/clean/hash_sha2.c +++ b/crypto_sign/sphincs-sha2-256s-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/crypto_sign/sphincs-shake-128f-simple/avx2/thash_shake_simplex4.c b/crypto_sign/sphincs-shake-128f-simple/avx2/thash_shake_simplex4.c index 89dc9a42..bbe04385 100644 --- a/crypto_sign/sphincs-shake-128f-simple/avx2/thash_shake_simplex4.c +++ b/crypto_sign/sphincs-shake-128f-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/crypto_sign/sphincs-shake-128s-simple/avx2/thash_shake_simplex4.c b/crypto_sign/sphincs-shake-128s-simple/avx2/thash_shake_simplex4.c index 89dc9a42..bbe04385 100644 --- a/crypto_sign/sphincs-shake-128s-simple/avx2/thash_shake_simplex4.c +++ b/crypto_sign/sphincs-shake-128s-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/crypto_sign/sphincs-shake-192f-simple/avx2/thash_shake_simplex4.c b/crypto_sign/sphincs-shake-192f-simple/avx2/thash_shake_simplex4.c index 89dc9a42..bbe04385 100644 --- a/crypto_sign/sphincs-shake-192f-simple/avx2/thash_shake_simplex4.c +++ b/crypto_sign/sphincs-shake-192f-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/crypto_sign/sphincs-shake-192s-simple/avx2/thash_shake_simplex4.c b/crypto_sign/sphincs-shake-192s-simple/avx2/thash_shake_simplex4.c index 89dc9a42..bbe04385 100644 --- a/crypto_sign/sphincs-shake-192s-simple/avx2/thash_shake_simplex4.c +++ b/crypto_sign/sphincs-shake-192s-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/crypto_sign/sphincs-shake-256f-simple/avx2/thash_shake_simplex4.c b/crypto_sign/sphincs-shake-256f-simple/avx2/thash_shake_simplex4.c index 89dc9a42..bbe04385 100644 --- a/crypto_sign/sphincs-shake-256f-simple/avx2/thash_shake_simplex4.c +++ b/crypto_sign/sphincs-shake-256f-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/crypto_sign/sphincs-shake-256s-simple/avx2/thash_shake_simplex4.c b/crypto_sign/sphincs-shake-256s-simple/avx2/thash_shake_simplex4.c index 89dc9a42..bbe04385 100644 --- a/crypto_sign/sphincs-shake-256s-simple/avx2/thash_shake_simplex4.c +++ b/crypto_sign/sphincs-shake-256s-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/test/test_format.py b/test/test_format.py index b07a23e8..474bfad5 100644 --- a/test/test_format.py +++ b/test/test_format.py @@ -1,6 +1,7 @@ import os import platform import pytest +import functools import helpers import pqclean @@ -10,18 +11,34 @@ if platform.machine() == "ppc": pytest.skip("Skipping this test on PowerPC to save cycles.", allow_module_level=True) + +__astyle_version_result = None + +def _get_astyle_version() -> str: + """Get the AStyle version number""" + # functools.lru_cache doesn't work because we want to print to stdout each time. + global __astyle_version_result + + if __astyle_version_result is None: + __astyle_version_result = helpers.run_subprocess(['astyle', '--version']) + else: + print(__astyle_version_result) + return __astyle_version_result + + helpers.ensure_available('astyle') # Check AStyle version def version_check(): - result = helpers.run_subprocess(['astyle', '--version']) + result = _get_astyle_version() if ("Artistic Style Version 3.4.8" in result or - "Artistic Style Version 3.4.9" in result): + "Artistic Style Version 3.4.9" in result or + "Artistic Style Version 3.4.10" in result): return False return "Artistic Style Version 3.4" in result if not version_check() and "CI" not in os.environ: - pytest.skip("Incompatible AStyle version (need 3.4.x) (not 3.4.{8,9})", allow_module_level=True) + pytest.skip("Incompatible AStyle version (need 3.4.x) (not 3.4.{8-10})", allow_module_level=True) @pytest.mark.parametrize( 'implementation', @@ -32,6 +49,8 @@ def version_check(): def test_format(implementation: pqclean.Implementation): cfiles = implementation.cfiles() hfiles = implementation.hfiles() + # Triggers a print + _get_astyle_version() result = helpers.run_subprocess( ['astyle', '--dry-run', From 07b1dd7b1406fae9b378c3ac8536020e8b0b2d92 Mon Sep 17 00:00:00 2001 From: Spencer Wilson Date: Mon, 5 Feb 2024 02:09:31 -0500 Subject: [PATCH 59/85] Add tests for variable-length signatures (#541) * Test for writing past the reported signature length * Use valgrind to detect behaviour that depends on the unused tail of the signature buffer. --- test/crypto_sign/functest.c | 64 +++++++++++++++++++++++++++++++++++++ test/test_valgrind.py | 2 +- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/test/crypto_sign/functest.c b/test/crypto_sign/functest.c index 8850f3ba..9f82423f 100644 --- a/test/crypto_sign/functest.c +++ b/test/crypto_sign/functest.c @@ -5,6 +5,9 @@ #include #include #include +#ifdef PQCLEAN_USE_VALGRIND +#include +#endif #ifndef NTESTS #define NTESTS 5 @@ -78,6 +81,15 @@ inline static void *malloc_s(size_t size) { #endif static int test_sign(void) { + /* + * In order to properly test variable-length signatures, we need to check + * that the implementation does not modify the provided buffer beyond the + * reported signature length. We do this by filling the buffer with random + * bytes before the call to sign and checking afterward that the tail has + * not been modified. + */ + uint8_t sm_random_cmp[MLEN + CRYPTO_BYTES]; + /* * This is most likely going to be aligned by the compiler. * 16 extra bytes for canary @@ -124,8 +136,29 @@ static int test_sign(void) { RETURNS_ZERO(crypto_sign_keypair(pk + 8, sk + 8)); randombytes(m + 8, MLEN); + // Fill the sm buffer with random bytes + randombytes(sm_random_cmp, MLEN + CRYPTO_BYTES); + memcpy(sm + 8, sm_random_cmp, MLEN + CRYPTO_BYTES); + +#ifdef PQCLEAN_USE_VALGRIND + /* + * With this buffer marked as undefined, valgrind will detect + * cases where the signing code depends on the value of the tail + * of the buffer. + */ + VALGRIND_MAKE_MEM_UNDEFINED(sm + 8, MLEN + CRYPTO_BYTES); +#endif + RETURNS_ZERO(crypto_sign(sm + 8, &smlen, m + 8, MLEN, sk + 8)); +#ifdef PQCLEAN_USE_VALGRIND + // We have to mark the tail as defined before doing the memcmp. + VALGRIND_MAKE_MEM_DEFINED(sm + 8 + smlen, MLEN + CRYPTO_BYTES - smlen); +#endif + + // check that the tail has not been modified + RETURNS_ZERO(memcmp(sm + 8 + smlen, sm_random_cmp + smlen, MLEN + CRYPTO_BYTES - smlen)); + // By relying on m == sm we prevent having to allocate CRYPTO_BYTES // twice if ((returncode = @@ -157,6 +190,15 @@ static int test_sign(void) { } static int test_sign_detached(void) { + /* + * In order to properly test variable-length signatures, we need to check + * that the implementation does not modify the provided buffer beyond the + * reported signature length. We do this by filling the buffer with random + * bytes before the call to sign and checking afterward that the tail has + * not been modified. + */ + uint8_t sig_random_cmp[CRYPTO_BYTES]; + /* * This is most likely going to be aligned by the compiler. * 16 extra bytes for canary @@ -202,8 +244,30 @@ static int test_sign_detached(void) { RETURNS_ZERO(crypto_sign_keypair(pk + 8, sk + 8)); randombytes(m + 8, MLEN); + + // Fill the sig buffer with random bytes + randombytes(sig_random_cmp, CRYPTO_BYTES); + memcpy(sig + 8, sig_random_cmp, CRYPTO_BYTES); + +#ifdef PQCLEAN_USE_VALGRIND + /* + * With this buffer marked as undefined, valgrind will detect + * cases where the signing code depends on the value of the tail + * of the buffer. + */ + VALGRIND_MAKE_MEM_UNDEFINED(sig + 8, CRYPTO_BYTES); +#endif + RETURNS_ZERO(crypto_sign_signature(sig + 8, &siglen, m + 8, MLEN, sk + 8)); +#ifdef PQCLEAN_USE_VALGRIND + // We have to mark the tail as defined before doing the memcmp. + VALGRIND_MAKE_MEM_DEFINED(sig + 8 + siglen, CRYPTO_BYTES - siglen); +#endif + + // check that the tail has not been modified + RETURNS_ZERO(memcmp(sig + 8 + siglen, sig_random_cmp + siglen, CRYPTO_BYTES - siglen)); + if ((returncode = crypto_sign_verify(sig + 8, siglen, m + 8, MLEN, pk + 8)) != 0) { fprintf(stderr, "ERROR Signature did not verify correctly!\n"); diff --git a/test/test_valgrind.py b/test/test_valgrind.py index e2d17fb9..24b59873 100644 --- a/test/test_valgrind.py +++ b/test/test_valgrind.py @@ -45,7 +45,7 @@ def test_valgrind(implementation: pqclean.Implementation, impl_path, test_dir, SCHEME_DIR=os.path.abspath(impl_path), IMPLEMENTATION=implementation.name, DEST_DIR=dest_dir, - EXTRAFLAGS="-gdwarf-4", + EXTRAFLAGS="-gdwarf-4 -DPQCLEAN_USE_VALGRIND", NTESTS=1, working_dir=os.path.join(test_dir, 'test')) functest_name = './functest_{}_{}'.format(implementation.scheme.name, From fcc6d5d7e12ccd6e9df5b2fb7b6cab073c1eacc0 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 13:42:09 +0100 Subject: [PATCH 60/85] update kyber aarch64 --- crypto_kem/kyber1024/aarch64/LICENSE | 125 +-- crypto_kem/kyber1024/aarch64/NTT_params.h | 20 +- crypto_kem/kyber1024/aarch64/__asm_NTT.S | 539 ++++++----- crypto_kem/kyber1024/aarch64/__asm_base_mul.S | 893 ++++++++++++++---- crypto_kem/kyber1024/aarch64/__asm_iNTT.S | 486 +++++----- crypto_kem/kyber1024/aarch64/__asm_poly.S | 35 +- crypto_kem/kyber1024/aarch64/api.h | 6 +- crypto_kem/kyber1024/aarch64/cbd.c | 39 + crypto_kem/kyber1024/aarch64/cbd.h | 2 +- crypto_kem/kyber1024/aarch64/feat.S | 6 +- crypto_kem/kyber1024/aarch64/fips202x2.c | 70 +- crypto_kem/kyber1024/aarch64/fips202x2.h | 9 +- crypto_kem/kyber1024/aarch64/indcpa.c | 160 +++- crypto_kem/kyber1024/aarch64/indcpa.h | 6 +- crypto_kem/kyber1024/aarch64/kem.c | 17 +- crypto_kem/kyber1024/aarch64/kem.h | 14 +- crypto_kem/kyber1024/aarch64/macros.inc | 342 ++++++- .../kyber1024/aarch64/macros_common.inc | 842 ++++++++++++++++- crypto_kem/kyber1024/aarch64/neon_poly.c | 15 +- crypto_kem/kyber1024/aarch64/neon_polyvec.c | 6 +- .../kyber1024/aarch64/neon_symmetric-shake.c | 9 +- crypto_kem/kyber1024/aarch64/ntt.c | 32 +- crypto_kem/kyber1024/aarch64/ntt.h | 73 +- crypto_kem/kyber1024/aarch64/params.h | 5 +- crypto_kem/kyber1024/aarch64/poly.c | 36 +- crypto_kem/kyber1024/aarch64/poly.h | 4 +- crypto_kem/kyber1024/aarch64/polyvec.c | 46 +- crypto_kem/kyber1024/aarch64/polyvec.h | 11 +- crypto_kem/kyber1024/aarch64/reduce.h | 6 +- crypto_kem/kyber1024/aarch64/rejsample.h | 2 +- .../kyber1024/aarch64/symmetric-shake.c | 2 + crypto_kem/kyber1024/aarch64/symmetric.h | 6 +- crypto_kem/kyber1024/aarch64/verify.h | 2 +- crypto_kem/kyber512/aarch64/LICENSE | 125 +-- crypto_kem/kyber512/aarch64/NTT_params.h | 20 +- crypto_kem/kyber512/aarch64/__asm_NTT.S | 539 ++++++----- crypto_kem/kyber512/aarch64/__asm_base_mul.S | 893 ++++++++++++++---- crypto_kem/kyber512/aarch64/__asm_iNTT.S | 486 +++++----- crypto_kem/kyber512/aarch64/__asm_poly.S | 35 +- crypto_kem/kyber512/aarch64/api.h | 6 +- crypto_kem/kyber512/aarch64/cbd.c | 14 + crypto_kem/kyber512/aarch64/cbd.h | 2 +- crypto_kem/kyber512/aarch64/feat.S | 6 +- crypto_kem/kyber512/aarch64/fips202x2.c | 70 +- crypto_kem/kyber512/aarch64/fips202x2.h | 9 +- crypto_kem/kyber512/aarch64/indcpa.c | 164 +++- crypto_kem/kyber512/aarch64/indcpa.h | 6 +- crypto_kem/kyber512/aarch64/kem.c | 17 +- crypto_kem/kyber512/aarch64/kem.h | 14 +- crypto_kem/kyber512/aarch64/macros.inc | 342 ++++++- crypto_kem/kyber512/aarch64/macros_common.inc | 842 ++++++++++++++++- crypto_kem/kyber512/aarch64/neon_poly.c | 15 +- crypto_kem/kyber512/aarch64/neon_polyvec.c | 6 +- .../kyber512/aarch64/neon_symmetric-shake.c | 9 +- crypto_kem/kyber512/aarch64/ntt.c | 32 +- crypto_kem/kyber512/aarch64/ntt.h | 73 +- crypto_kem/kyber512/aarch64/params.h | 5 +- crypto_kem/kyber512/aarch64/poly.c | 49 +- crypto_kem/kyber512/aarch64/poly.h | 4 +- crypto_kem/kyber512/aarch64/polyvec.c | 55 +- crypto_kem/kyber512/aarch64/polyvec.h | 11 +- crypto_kem/kyber512/aarch64/reduce.h | 6 +- crypto_kem/kyber512/aarch64/rejsample.h | 2 +- crypto_kem/kyber512/aarch64/symmetric-shake.c | 2 + crypto_kem/kyber512/aarch64/symmetric.h | 6 +- crypto_kem/kyber512/aarch64/verify.h | 2 +- crypto_kem/kyber768/aarch64/LICENSE | 125 +-- crypto_kem/kyber768/aarch64/NTT_params.h | 20 +- crypto_kem/kyber768/aarch64/__asm_NTT.S | 539 ++++++----- crypto_kem/kyber768/aarch64/__asm_base_mul.S | 893 ++++++++++++++---- crypto_kem/kyber768/aarch64/__asm_iNTT.S | 486 +++++----- crypto_kem/kyber768/aarch64/__asm_poly.S | 35 +- crypto_kem/kyber768/aarch64/api.h | 6 +- crypto_kem/kyber768/aarch64/cbd.c | 39 + crypto_kem/kyber768/aarch64/cbd.h | 2 +- crypto_kem/kyber768/aarch64/feat.S | 6 +- crypto_kem/kyber768/aarch64/fips202x2.c | 70 +- crypto_kem/kyber768/aarch64/fips202x2.h | 9 +- crypto_kem/kyber768/aarch64/indcpa.c | 98 +- crypto_kem/kyber768/aarch64/indcpa.h | 6 +- crypto_kem/kyber768/aarch64/kem.c | 17 +- crypto_kem/kyber768/aarch64/kem.h | 14 +- crypto_kem/kyber768/aarch64/macros.inc | 342 ++++++- crypto_kem/kyber768/aarch64/macros_common.inc | 842 ++++++++++++++++- crypto_kem/kyber768/aarch64/neon_poly.c | 15 +- crypto_kem/kyber768/aarch64/neon_polyvec.c | 6 +- .../kyber768/aarch64/neon_symmetric-shake.c | 9 +- crypto_kem/kyber768/aarch64/ntt.c | 32 +- crypto_kem/kyber768/aarch64/ntt.h | 73 +- crypto_kem/kyber768/aarch64/params.h | 1 + crypto_kem/kyber768/aarch64/poly.c | 49 +- crypto_kem/kyber768/aarch64/poly.h | 4 +- crypto_kem/kyber768/aarch64/polyvec.c | 55 +- crypto_kem/kyber768/aarch64/polyvec.h | 11 +- crypto_kem/kyber768/aarch64/reduce.h | 6 +- crypto_kem/kyber768/aarch64/rejsample.h | 2 +- crypto_kem/kyber768/aarch64/symmetric-shake.c | 2 + crypto_kem/kyber768/aarch64/symmetric.h | 6 +- crypto_kem/kyber768/aarch64/verify.h | 2 +- 99 files changed, 8511 insertions(+), 3044 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/LICENSE b/crypto_kem/kyber1024/aarch64/LICENSE index 0e259d42..093b0a7d 100644 --- a/crypto_kem/kyber1024/aarch64/LICENSE +++ b/crypto_kem/kyber1024/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/crypto_kem/kyber1024/aarch64/NTT_params.h b/crypto_kem/kyber1024/aarch64/NTT_params.h index d0934820..f2607092 100644 --- a/crypto_kem/kyber1024/aarch64/NTT_params.h +++ b/crypto_kem/kyber1024/aarch64/NTT_params.h @@ -2,7 +2,9 @@ #define NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -40,27 +42,27 @@ #define invomegaQ1 1175 // R = 2^15 below // RmodQ1 = 2^15 mod^{+-} Q1 -#define RmodQ1 (-522) +#define RmodQ1 -522 // Q1prime = Q1^{-1} mod^{+-} 2^15 -#define Q1prime (-3327) +#define Q1prime -3327 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 3303 // R2modQ1 = 2^16 mod^{+-} Q1 -#define R2modQ1 (-1044) +#define R2modQ1 -1044 // Q1prime2 = -Q1^{-1} mod^{+-} 2^16 #define Q1prime2 3327 // R3modQ1 = -2^32 mod^{+-} Q1 -#define R3modQ1 (-1353) +#define R3modQ1 -1353 // R3modQ1_prime = (R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime (-20552) +#define R3modQ1_prime -20552 // R3modQ1_prime_half = ( (R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime_half (-10276) +#define R3modQ1_prime_half -10276 // R3modQ1_doubleprime (R3modQ1_prime Q1 - (R3modQ1 + Q1)) / 2^16 -#define R3modQ1_doubleprime (-1044) +#define R3modQ1_doubleprime -1044 // invNQ1_R3modQ1 = -NTT_N^{-1} 2^32 mod^{+-} Q1 -#define invNQ1_R3modQ1 (-1441) +#define invNQ1_R3modQ1 -1441 // invNQ1_R3modQ1_prime = (invNQ1_R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 #define invNQ1_R3modQ1_prime 10080 // invNQ1_R3modQ1_prime_half = ( (invNQ1_R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 diff --git a/crypto_kem/kyber1024/aarch64/__asm_NTT.S b/crypto_kem/kyber1024/aarch64/__asm_NTT.S index 0469fcd6..65fa23f9 100644 --- a/crypto_kem/kyber1024/aarch64/__asm_NTT.S +++ b/crypto_kem/kyber1024/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,165 +36,188 @@ PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top: _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top: - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 + push_simd + Q .req w8 + src .req x0 + table .req x1 + counter .req x11 ldrsh Q, [x2, #0] - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 + ldr q0, [table, # 0*16] + ldr q1, [table, # 1*16] + ldr q2, [table, # 2*16] + ldr q3, [table, # 3*16] mov v0.H[0], Q - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr q13, [src, # 9*32] + ldr q15, [src, #11*32] + ldr q17, [src, #13*32] + ldr q19, [src, #15*32] + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32 + + qo_butterfly_mixll \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32, \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixsls \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_botsls \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #9*32, #11*32, #13*32, #15*32, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32 + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) + + qo_butterfly_mixl \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixss \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_botss \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 + .unreq src .unreq table .unreq counter - pop_all - - br lr + pop_simd + ret .align 2 .global PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot @@ -199,13 +225,13 @@ _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top: PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot: _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot: - push_all - Q .req w20 - BarrettM .req w21 + push_simd + Q .req w8 + BarrettM .req w9 src0 .req x0 src1 .req x1 - table .req x28 - counter .req x19 + table .req x10 + counter .req x11 ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] @@ -215,99 +241,127 @@ _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, # 1*16] + ldr q29, [src1, # 1*16] + ldr q30, [src0, # 3*16] + ldr q31, [src1, # 3*16] + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 mov counter, #3 _ntt_bot_loop: - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + str q28, [src0, # 1*16] + ldr q28, [src0, #(64+1*16)] + str q29, [src1, # 1*16] + ldr q29, [src1, #(64+1*16)] + str q30, [src0, # 3*16] + ldr q30, [src0, #(64+3*16)] + str q31, [src1, # 3*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + str q28, [src0, # 1*16] + str q29, [src1, # 1*16] + str q30, [src0, # 3*16] + str q31, [src1, # 3*16] + + add src0, src0, #64 + add src1, src1, #64 .unreq Q .unreq BarrettM @@ -315,12 +369,9 @@ _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot: .unreq src1 .unreq table .unreq counter - pop_all - - br lr - - + pop_simd + ret diff --git a/crypto_kem/kyber1024/aarch64/__asm_base_mul.S b/crypto_kem/kyber1024/aarch64/__asm_base_mul.S index 1b7aed00..fe18783c 100644 --- a/crypto_kem/kyber1024/aarch64/__asm_base_mul.S +++ b/crypto_kem/kyber1024/aarch64/__asm_base_mul.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -44,44 +47,195 @@ _PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended: ldrsh Q, [x3] - dup v20.8H, Q - - // TODO: unroll this, currently we are using only 16 SIMD registers - mov counter, #4 - _point_mul_extended_loop: - - ld2 { v0.8H, v1.8H}, [src1], #32 - ld2 { v2.8H, v3.8H}, [src1], #32 - ld2 { v4.8H, v5.8H}, [src1], #32 - ld2 { v6.8H, v7.8H}, [src1], #32 + dup v28.8H, Q - ld2 { v8.8H, v9.8H}, [src2ex], #32 - ld2 {v10.8H, v11.8H}, [src2ex], #32 - ld2 {v12.8H, v13.8H}, [src2ex], #32 - ld2 {v14.8H, v15.8H}, [src2ex], #32 + ldr q0, [src1, #0*16] + ldr q1, [src1, #1*16] + ldr q2, [src1, #2*16] + ldr q3, [src1, #3*16] + ldr q4, [src1, #4*16] + ldr q5, [src1, #5*16] + ldr q6, [src1, #6*16] + ldr q7, [src1, #7*16] + + add src1, src1, #8*16 + + uzp2 v1.8H, v0.8H, v1.8H + uzp2 v3.8H, v2.8H, v3.8H + uzp2 v5.8H, v4.8H, v5.8H + uzp2 v7.8H, v6.8H, v7.8H + + ldr q8, [src2ex, #0*16] + ldr q10, [src2ex, #2*16] + ldr q12, [src2ex, #4*16] + ldr q14, [src2ex, #6*16] + ldr q9, [src2ex, #1*16] + ldr q11, [src2ex, #3*16] + ldr q13, [src2ex, #5*16] + ldr q15, [src2ex, #7*16] + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] + sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] + sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] + sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] + sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] + mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] + mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] + mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] + mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q0, [src1, #0*16] + sqrdmulh v16.8H, v17.8H, v8.8H + ldr q1, [src1, #1*16] + sqrdmulh v18.8H, v19.8H, v10.8H + ldr q2, [src1, #2*16] + sqrdmulh v20.8H, v21.8H, v12.8H + ldr q3, [src1, #3*16] + sqrdmulh v22.8H, v23.8H, v14.8H + + ldr q4, [src1, #4*16] + mul v17.8H, v17.8H, v9.8H + uzp2 v1.8H, v0.8H, v1.8H + ldr q5, [src1, #5*16] + mul v19.8H, v19.8H, v11.8H + uzp2 v3.8H, v2.8H, v3.8H + ldr q6, [src1, #6*16] + mul v21.8H, v21.8H, v13.8H + uzp2 v5.8H, v4.8H, v5.8H + ldr q7, [src1, #7*16] + mul v23.8H, v23.8H, v15.8H + uzp2 v7.8H, v6.8H, v7.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v17.8H, v16.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v19.8H, v18.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v21.8H, v20.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v23.8H, v22.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q17, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q19, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q21, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q23, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H - mls v1.8H, v0.8H, v20.8H - mls v3.8H, v2.8H, v20.8H - mls v5.8H, v4.8H, v20.8H - mls v7.8H, v6.8H, v20.8H + add src1, src1, #8*16 - st1 { v1.8H}, [des], #16 - st1 { v3.8H}, [des], #16 - st1 { v5.8H}, [des], #16 - st1 { v7.8H}, [des], #16 + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + sqrdmulh v16.8H, v17.8H, v8.8H + sqrdmulh v18.8H, v19.8H, v10.8H + sqrdmulh v20.8H, v21.8H, v12.8H + sqrdmulh v22.8H, v23.8H, v14.8H + + mul v17.8H, v17.8H, v9.8H + mul v19.8H, v19.8H, v11.8H + mul v21.8H, v21.8H, v13.8H + mul v23.8H, v23.8H, v15.8H + + mls v17.8H, v16.8H, v28.8H + mls v19.8H, v18.8H, v28.8H + mls v21.8H, v20.8H, v28.8H + mls v23.8H, v22.8H, v28.8H + + str q17, [des, #0*16] + str q19, [des, #1*16] + str q21, [des, #2*16] + str q23, [des, #3*16] + + add des, des, #4*16 - sub counter, counter, #1 - cbnz counter, _point_mul_extended_loop .unreq Q .unreq des @@ -90,7 +244,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended: .unreq counter pop_all - br lr + ret .align 2 @@ -100,8 +254,6 @@ PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: push_all - Q .req w28 - Qprime2 .req w27 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -117,8 +269,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] + ldr s4, [x3] add des, x4, #0 @@ -138,94 +289,294 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: add src2asy_3, src2asy_0, #256*3 #endif - dup v28.8H, Q - dup v29.8H, Qprime2 + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - // TODO:interleaving - mov counter, #16 - _asymmetric_mul_loop: + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + // TODO:interleaving + mov counter, #15 + _asymmetric_mul_loop: + + ldr q20, [src1_0, #0*16] + uzp1 v6.8H, v16.8H, v18.8H + ldr q21, [src1_0, #1*16] + uzp1 v7.8H, v17.8H, v19.8H + + ldr q22, [src2_0, #0*16] + mul v6.8H, v6.8H, v4.H[1] + ldr q23, [src2_0, #1*16] + mul v7.8H, v7.8H, v4.H[1] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + smlal v16.4S, v6.4H, v4.H[0] + uzp1 v0.8H, v20.8H, v21.8H + smlal2 v18.4S, v6.8H, v4.H[0] + uzp2 v1.8H, v20.8H, v21.8H + smlal v17.4S, v7.4H, v4.H[0] + uzp1 v2.8H, v22.8H, v23.8H + smlal2 v19.4S, v7.8H, v4.H[0] + uzp2 v3.8H, v22.8H, v23.8H + + ld1 {v28.8H}, [src2asy_0], #16 + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H +#endif + +#else - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - st2 {v24.8H, v25.8H}, [des], #32 +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_loop - .unreq Q - .unreq Qprime2 + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -242,7 +593,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: .unreq counter pop_all - br lr + ret .align 2 @@ -252,10 +603,6 @@ PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: push_all - Q .req w28 - Qprime2 .req w27 - R3 .req w26 - R3p .req w25 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -271,11 +618,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] - - ldrsh R3, [x3, #8] - ldrsh R3p, [x3, #10] + ldr q4, [x3] add des, x4, #0 @@ -295,108 +638,312 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: add src2asy_3, src2asy_0, #256*3 #endif - dup v26.8H, R3 - dup v27.8H, R3p + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - dup v28.8H, Q - dup v29.8H, Qprime2 + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 - // TODO: interleaving - mov counter, #16 - _asymmetric_mul_montgomery_loop: + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + mov counter, #15 + _asymmetric_mul_montgomery_loop: + + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + ldr q20, [src1_0, #0*16] + smlal v16.4S, v6.4H, v4.H[0] + ldr q21, [src1_0, #1*16] + smlal2 v18.4S, v6.8H, v4.H[0] + ldr q22, [src2_0, #0*16] + smlal v17.4S, v7.4H, v4.H[0] + ldr q23, [src2_0, #1*16] + smlal2 v19.4S, v7.8H, v4.H[0] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H + uzp1 v0.8H, v20.8H, v21.8H + sqrdmulh v16.8H, v6.8H, v4.H[4] + uzp2 v1.8H, v20.8H, v21.8H + sqrdmulh v17.8H, v7.8H, v4.H[4] - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + uzp1 v2.8H, v22.8H, v23.8H + mul v6.8H, v6.8H, v4.H[5] + uzp2 v3.8H, v22.8H, v23.8H + mul v7.8H, v7.8H, v4.H[5] - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] - sqrdmulh v16.8H, v24.8H, v26.8H - sqrdmulh v17.8H, v25.8H, v26.8H + st2 { v6.8H, v7.8H}, [des], #32 - mul v24.8H, v24.8H, v27.8H - mul v25.8H, v25.8H, v27.8H + ld1 {v28.8H}, [src2asy_0], #16 - mls v24.8H, v16.8H, v28.8H - mls v25.8H, v17.8H, v28.8H + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H + +#endif - st2 {v24.8H, v25.8H}, [des], #32 +#else + + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H + +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_montgomery_loop - .unreq Q - .unreq Qprime2 - .unreq R3 - .unreq R3p + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + sqrdmulh v16.8H, v6.8H, v4.H[4] + sqrdmulh v17.8H, v7.8H, v4.H[4] + + mul v6.8H, v6.8H, v4.H[5] + mul v7.8H, v7.8H, v4.H[5] + + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -413,7 +960,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: .unreq counter pop_all - br lr + ret diff --git a/crypto_kem/kyber1024/aarch64/__asm_iNTT.S b/crypto_kem/kyber1024/aarch64/__asm_iNTT.S index 930b519c..58a524ac 100644 --- a/crypto_kem/kyber1024/aarch64/__asm_iNTT.S +++ b/crypto_kem/kyber1024/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -49,57 +52,116 @@ _PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - mov counter, #4 + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, #1*16] + ldr q29, [src1, #1*16] + ldr q30, [src0, #3*16] + ldr q31, [src1, #3*16] + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 + + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 + + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 + + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 + + mov counter, #3 _intt_bot_loop: - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM + str q24, [src0, #0*16] + ldr q28, [src0, #(64+1*16)] + str q25, [src1, #0*16] + ldr q29, [src1, #(64+1*16)] + str q26, [src0, #2*16] + ldr q30, [src0, #(64+3*16)] + str q27, [src1, #2*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 - do_butterfly_vec_mixed_rev v28, v30, v18, v19, v29, v31, v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, v8, v9, v10, v11 - do_butterfly_vec_mixed_rev v24, v26, v16, v17, v25, v27, v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, v6, v7, v6, v7 - do_butterfly_vec_mixed_rev v28, v29, v18, v19, v30, v31, v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, v4, v5, v4, v5 - do_butterfly_vec_mixed_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 - do_butterfly_vec_mixed_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 - qo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v1, #11, v0 + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 sub counter, counter, #1 cbnz counter, _intt_bot_loop + str q24, [src0, #0*16] + str q25, [src1, #0*16] + str q26, [src0, #2*16] + str q27, [src1, #2*16] + + .unreq Q .unreq BarrettM .unreq src0 @@ -108,7 +170,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot: .unreq counter pop_all - br lr + ret .align 2 .global PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top @@ -121,245 +183,131 @@ _PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top: BarrettM .req w21 invN .req w22 invN_f .req w23 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 + table .req x1 counter .req x19 - ldrsh Q, [x2, #0] + ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] - ldr invN, [x2, #10] - ldr invN_f, [x2, #14] - - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 - - mov v0.H[0], Q - - dup v24.8H, Q - dup v25.8H, BarrettM - - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr invN, [x2, #10] + ldr invN_f, [x2, #14] + + mov v4.S[0], invN + mov v4.S[1], invN_f + + ldr q0, [table, #0*16] + mov v0.H[0], Q + + ldr q1, [table, #1*16] + ldr q2, [table, #2*16] + ldr q3, [table, #3*16] + + ldr q16, [src, # 8*32] + ldr q17, [src, # 9*32] + ldr q18, [src, #10*32] + ldr q19, [src, #11*32] + ldr q20, [src, #12*32] + ldr q21, [src, #13*32] + ldr q22, [src, #14*32] + ldr q23, [src, #15*32] + + qo_butterfly_botll \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q12, q13, q14, q15, \ + #4*32, #5*32, #6*32, #7*32 + + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_topsl \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #8*32, #9*32, #10*32, #11*32, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + qo_montgomery_mul_insl \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #12*32, #13*32, #14*32, #15*32, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_butterfly_botsl_mul \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32), \ + v12, v13, v14, v15, v24, v25, v26, v27, \ + v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0 + + str q12, [src, # 4*32] + ldr q12, [src, #(16+ 4*32)] + str q13, [src, # 5*32] + ldr q13, [src, #(16+ 5*32)] + str q14, [src, # 6*32] + ldr q14, [src, #(16+ 6*32)] + str q15, [src, # 7*32] + ldr q15, [src, #(16+ 7*32)] + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_tops \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + + qo_montgomery_mul_ins \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_montgomery_mul_ins \ + v12, v13, v14, v15, v24, v25, v26, v27, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32) + + str q12, [src, #(16+ 4*32)] + str q13, [src, #(16+ 5*32)] + str q14, [src, #(16+ 6*32)] + str q15, [src, #(16+ 7*32)] .unreq Q .unreq BarrettM .unreq invN .unreq invN_f - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr - - - - + ret diff --git a/crypto_kem/kyber1024/aarch64/__asm_poly.S b/crypto_kem/kyber1024/aarch64/__asm_poly.S index 00fec3d0..7d461016 100644 --- a/crypto_kem/kyber1024/aarch64/__asm_poly.S +++ b/crypto_kem/kyber1024/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_KYBER1024_AARCH64_asm_add_reduce -.global _PQCLEAN_KYBER1024_AARCH64_asm_add_reduce -PQCLEAN_KYBER1024_AARCH64_asm_add_reduce: -_PQCLEAN_KYBER1024_AARCH64_asm_add_reduce: +.global PQCLEAN_KYBER1024_AARCH64__asm_add_reduce +.global _PQCLEAN_KYBER1024_AARCH64__asm_add_reduce +PQCLEAN_KYBER1024_AARCH64__asm_add_reduce: +_PQCLEAN_KYBER1024_AARCH64__asm_add_reduce: mov w4, #3329 mov w5, #25519 @@ -86,13 +89,13 @@ _PQCLEAN_KYBER1024_AARCH64_asm_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce -.global _PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce -PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce: -_PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce: +.global PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce +.global _PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce +PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce: +_PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce: mov w4, #3329 mov w5, #25519 @@ -147,13 +150,13 @@ _PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce -.global _PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce -PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce: -_PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce: +.global PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce +.global _PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce +PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce: +_PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce: mov w4, #3329 mov w5, #25519 @@ -232,7 +235,7 @@ _PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v20.8H, v21.8H, v22.8H, v23.8H}, [x0], #64 - br lr + ret diff --git a/crypto_kem/kyber1024/aarch64/api.h b/crypto_kem/kyber1024/aarch64/api.h index e09f90cb..217634f3 100644 --- a/crypto_kem/kyber1024/aarch64/api.h +++ b/crypto_kem/kyber1024/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef PQCLEAN_KYBER1024_AARCH64_API_H -#define PQCLEAN_KYBER1024_AARCH64_API_H +#ifndef API_H +#define API_H /* * This file is licensed @@ -13,7 +13,7 @@ #define PQCLEAN_KYBER1024_AARCH64_CRYPTO_PUBLICKEYBYTES 1568 #define PQCLEAN_KYBER1024_AARCH64_CRYPTO_CIPHERTEXTBYTES 1568 #define PQCLEAN_KYBER1024_AARCH64_CRYPTO_BYTES 32 -#define PQCLEAN_KYBER1024_AARCH64_CRYPTO_ALGNAME "Kyber1024" +#define PQCLEAN_KYBER1024_AARCH64_CRYPTO_ALGNAME "Kyber1024" int PQCLEAN_KYBER1024_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); diff --git a/crypto_kem/kyber1024/aarch64/cbd.c b/crypto_kem/kyber1024/aarch64/cbd.c index 6ae95c03..a96d0516 100644 --- a/crypto_kem/kyber1024/aarch64/cbd.c +++ b/crypto_kem/kyber1024/aarch64/cbd.c @@ -127,6 +127,15 @@ void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) { * * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) **************************************************/ +#if KYBER_ETA1 == 3 +static uint32_t load24_littleendian(const uint8_t x[3]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} +#endif /************************************************* * Name: cbd3 @@ -139,11 +148,41 @@ void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ +#if KYBER_ETA1 == 3 +static void cbd3(int16_t *r, const uint8_t buf[3 * KYBER_N / 4]) { + unsigned int i, j; + uint32_t t, d; + int16_t a, b; + + for (i = 0; i < KYBER_N / 4; i++) { + t = load24_littleendian(buf + 3 * i); + d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) { + a = (d >> (6 * j + 0)) & 0x7; + b = (d >> (6 * j + 3)) & 0x7; + r[4 * i + j] = a - b; + } + } +} +#endif void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + #if KYBER_ETA1 == 2 neon_cbd2(r, buf); + #elif KYBER_ETA1 == 3 + cbd3(r, buf); + #else +#error "This implementation requires eta1 in {2,3}" + #endif } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + #if KYBER_ETA2 == 2 neon_cbd2(r, buf); + #else +#error "This implementation requires eta2 = 2" + #endif } diff --git a/crypto_kem/kyber1024/aarch64/cbd.h b/crypto_kem/kyber1024/aarch64/cbd.h index 47a06806..688abf43 100644 --- a/crypto_kem/kyber1024/aarch64/cbd.h +++ b/crypto_kem/kyber1024/aarch64/cbd.h @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "poly.h" -#include #define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1) void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); diff --git a/crypto_kem/kyber1024/aarch64/feat.S b/crypto_kem/kyber1024/aarch64/feat.S index d7dda5bc..f467fa80 100644 --- a/crypto_kem/kyber1024/aarch64/feat.S +++ b/crypto_kem/kyber1024/aarch64/feat.S @@ -123,10 +123,8 @@ SOFTWARE. .endm .align 4 -.global PQCLEAN_KYBER1024_AARCH64_f1600x2 -.global _PQCLEAN_KYBER1024_AARCH64_f1600x2 -PQCLEAN_KYBER1024_AARCH64_f1600x2: -_PQCLEAN_KYBER1024_AARCH64_f1600x2: +.global _f1600x2 +_f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! diff --git a/crypto_kem/kyber1024/aarch64/fips202x2.c b/crypto_kem/kyber1024/aarch64/fips202x2.c index 3cefe848..e045ee3d 100644 --- a/crypto_kem/kyber1024/aarch64/fips202x2.c +++ b/crypto_kem/kyber1024/aarch64/fips202x2.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -36,6 +37,11 @@ #include #include "fips202x2.h" +#ifdef PROFILE_HASHING +#include "hal.h" +extern unsigned long long hash_cycles; +#endif + #define NROUNDS 24 // Define NEON operation @@ -47,20 +53,20 @@ #define vxor(c, a, b) c = veorq_u64(a, b); // Rotate by n bit ((a << offset) ^ (a >> (64-offset))) #define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); + out = vshlq_n_u64(a, offset); \ + out = vsriq_n_u64(out, a, 64 - offset); // Xor chain: out = a ^ b ^ c ^ d ^ e #define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); // Not And c = ~a & b // #define vbic(c, a, b) c = vbicq_u64(b, a); // Xor Not And: out = a ^ ( (~b) & c) #define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); // Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support #define vrxor(c, a, b) c = vrax1q_u64(a, b); // End Define @@ -100,11 +106,11 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { * * Arguments: - uint64_t *state: pointer to input/output Keccak state **************************************************/ -extern void PQCLEAN_KYBER1024_AARCH64_f1600x2(v128 *, const uint64_t *); +extern void f1600x2(v128 *, const uint64_t *); static inline void KeccakF1600_StatePermutex2(v128 state[25]) { #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_KYBER1024_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); + f1600x2(state, neon_KeccakF_RoundConstants); #else v128 Aba, Abe, Abi, Abo, Abu; v128 Aga, Age, Agi, Ago, Agu; @@ -551,7 +557,14 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -570,7 +583,14 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -587,7 +607,14 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -606,7 +633,14 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -625,6 +659,9 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -644,6 +681,10 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -662,6 +703,9 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -681,4 +725,8 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } diff --git a/crypto_kem/kyber1024/aarch64/fips202x2.h b/crypto_kem/kyber1024/aarch64/fips202x2.h index 14ceb782..3066c52b 100644 --- a/crypto_kem/kyber1024/aarch64/fips202x2.h +++ b/crypto_kem/kyber1024/aarch64/fips202x2.h @@ -8,9 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" -#include #include +#include typedef uint64x2_t v128; @@ -23,31 +22,26 @@ typedef struct { v128 s[25]; } keccakx2_state; -#define shake128x2_absorb KYBER_NAMESPACE(shake128x2_absorb) void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake128x2_squeezeblocks KYBER_NAMESPACE(shake128x2_squeezeblocks) void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake256x2_absorb KYBER_NAMESPACE(shake256x2_absorb) void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake256x2_squeezeblocks KYBER_NAMESPACE(shake256x2_squeezeblocks) void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake128x2 KYBER_NAMESPACE(shake128x2) void shake128x2(uint8_t *out0, uint8_t *out1, size_t outlen, @@ -55,7 +49,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in1, size_t inlen); -#define shake256x2 KYBER_NAMESPACE(shake256x2) void shake256x2(uint8_t *out0, uint8_t *out1, size_t outlen, diff --git a/crypto_kem/kyber1024/aarch64/indcpa.c b/crypto_kem/kyber1024/aarch64/indcpa.c index 8648f17b..93c0f9b7 100644 --- a/crypto_kem/kyber1024/aarch64/indcpa.c +++ b/crypto_kem/kyber1024/aarch64/indcpa.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -161,6 +162,126 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; neon_xof_state state; + #if KYBER_K == 2 + for (unsigned int i = 0; i < KYBER_K; i++) { + if (transposed) { + neon_xof_absorb(&state, seed, i, i, 0, 1); + } else { + neon_xof_absorb(&state, seed, 0, 1, i, i); + } + + neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); + + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + + ctr0 = neon_rej_uniform(&(a[i][0][0]), buf0); + ctr1 = neon_rej_uniform(&(a[i][1][0]), buf1); + while (ctr0 < KYBER_N || ctr1 < KYBER_N) { + off = buflen % 3; + for (k = 0; k < off; k++) { + buf0[k] = buf0[buflen - off + k]; + buf1[k] = buf1[buflen - off + k]; + } + neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); + + buflen = off + XOF_BLOCKBYTES; + ctr0 += rej_uniform(&(a[i][0][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); + ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); + } + } + #elif KYBER_K == 3 + int16_t *s1 = NULL, *s2 = NULL; + unsigned int x1, x2, y1, y2; + xof_state c_state; + + for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2) { + switch (j) { + case 0: + s1 = &(a[0][0][0]); + s2 = &(a[0][1][0]); + x1 = 0; + y1 = 0; + x2 = 0; + y2 = 1; + break; + case 2: + s1 = &(a[0][2][0]); + s2 = &(a[1][0][0]); + x1 = 0; + y1 = 2; + x2 = 1; + y2 = 0; + break; + case 4: + s1 = &(a[1][1][0]); + s2 = &(a[1][2][0]); + x1 = 1; + y1 = 1; + x2 = 1; + y2 = 2; + break; + default: + s1 = &(a[2][0][0]); + s2 = &(a[2][1][0]); + x1 = 2; + y1 = 0; + x2 = 2; + y2 = 1; + break; + } + + if (transposed) { + neon_xof_absorb(&state, seed, x1, x2, y1, y2); + } else { + neon_xof_absorb(&state, seed, y1, y2, x1, x2); + } + + neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); + + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + + ctr0 = neon_rej_uniform(s1, buf0); + ctr1 = neon_rej_uniform(s2, buf1); + + while (ctr0 < KYBER_N || ctr1 < KYBER_N) { + off = buflen % 3; + for (k = 0; k < off; k++) { + buf0[k] = buf0[buflen - off + k]; + buf1[k] = buf1[buflen - off + k]; + } + neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); + + buflen = off + XOF_BLOCKBYTES; + ctr0 += rej_uniform(s1 + ctr0, KYBER_N - ctr0, buf0, buflen); + ctr1 += rej_uniform(s2 + ctr1, KYBER_N - ctr1, buf1, buflen); + } + } + + // Last iteration [2][2] + if (transposed) { + xof_absorb(&c_state, seed, 2, 2); + } else { + xof_absorb(&c_state, seed, 2, 2); + } + + xof_squeezeblocks(buf0, GEN_MATRIX_NBLOCKS, &c_state); + + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + + ctr0 = neon_rej_uniform(&(a[2][2][0]), buf0); + + while (ctr0 < KYBER_N) { + off = buflen % 3; + for (k = 0; k < off; k++) { + buf0[k] = buf0[buflen - off + k]; + } + xof_squeezeblocks(buf0 + off, 1, &c_state); + + buflen = off + XOF_BLOCKBYTES; + ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); + } + + #elif KYBER_K == 4 for (unsigned int i = 0; i < KYBER_K; i++) { for (unsigned int j = 0; j < KYBER_K; j += 2) { if (transposed) { @@ -188,6 +309,9 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S } } } + #else +#error "KYBER_K must be in {2,3,4}" + #endif } /************************************************* @@ -202,8 +326,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]) { + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; uint8_t buf[2 * KYBER_SYMBYTES]; const uint8_t *publicseed = buf; @@ -218,10 +342,19 @@ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], gen_a(a, publicseed); + #if KYBER_K == 2 + neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); + neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3); + #elif KYBER_K == 3 + neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); + neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(e[0][0]), noiseseed, 2, 3); + neon_poly_getnoise_eta1_2x(&(e[1][0]), &(e[2][0]), noiseseed, 4, 5); + #elif KYBER_K == 4 neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(skpv[3][0]), noiseseed, 2, 3); neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 4, 5); neon_poly_getnoise_eta1_2x(&(e[2][0]), &(e[3][0]), noiseseed, 6, 7); + #endif neon_polyvec_ntt(skpv); neon_polyvec_ntt(e); @@ -277,11 +410,32 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], poly_frommsg(k, m); gen_at(at, seed); + #if KYBER_K == 2 + // ETA1 != ETA2 (3 != 2) + neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); + neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3); + neon_poly_getnoise_eta2(&(epp[0]), coins, 4); + #elif KYBER_K == 3 + #if KYBER_ETA1 == KYBER_ETA2 + // Because ETA1 == ETA2 + neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); + neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(ep[0][0]), coins, 2, 3); + neon_poly_getnoise_eta1_2x(&(ep[1][0]), &(ep[2][0]), coins, 4, 5); + neon_poly_getnoise_eta2(&(epp[0]), coins, 6); + #else +#error "We need eta1 == eta2 here" + #endif + #elif KYBER_K == 4 + #if KYBER_ETA1 == KYBER_ETA2 neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3); neon_poly_getnoise_eta1_2x(&(ep[0][0]), &(ep[1][0]), coins, 4, 5); neon_poly_getnoise_eta1_2x(&(ep[2][0]), &(ep[3][0]), coins, 6, 7); neon_poly_getnoise_eta2(&(epp[0]), coins, 8); + #else +#error "We need eta1 == eta2 here" + #endif + #endif neon_polyvec_ntt(sp); diff --git a/crypto_kem/kyber1024/aarch64/indcpa.h b/crypto_kem/kyber1024/aarch64/indcpa.h index f93487a3..30608327 100644 --- a/crypto_kem/kyber1024/aarch64/indcpa.h +++ b/crypto_kem/kyber1024/aarch64/indcpa.h @@ -7,16 +7,16 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "polyvec.h" -#include #define gen_matrix KYBER_NAMESPACE(gen_matrix) void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); #define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand) void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]); + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]); #define indcpa_enc KYBER_NAMESPACE(indcpa_enc) void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], diff --git a/crypto_kem/kyber1024/aarch64/kem.c b/crypto_kem/kyber1024/aarch64/kem.c index 670a4c59..a71d5ac6 100644 --- a/crypto_kem/kyber1024/aarch64/kem.c +++ b/crypto_kem/kyber1024/aarch64/kem.c @@ -8,12 +8,15 @@ #include #include #include + +#include "api.h" #include "params.h" +#include "kem.h" #include "indcpa.h" #include "verify.h" #include "symmetric.h" #include "randombytes.h" -#include "kem.h" + /************************************************* * Name: crypto_kem_keypair_derand @@ -31,8 +34,8 @@ * Returns 0 (success) **************************************************/ int crypto_kem_keypair_derand(uint8_t *pk, - uint8_t *sk, - const uint8_t *coins) { + uint8_t *sk, + const uint8_t *coins) { indcpa_keypair_derand(pk, sk, coins); memcpy(sk + KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES); hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); @@ -62,6 +65,8 @@ int crypto_kem_keypair(uint8_t *pk, return 0; } + + /************************************************* * Name: crypto_kem_enc_derand * @@ -80,9 +85,9 @@ int crypto_kem_keypair(uint8_t *pk, * Returns 0 (success) **************************************************/ int crypto_kem_enc_derand(uint8_t *ct, - uint8_t *ss, - const uint8_t *pk, - const uint8_t *coins) { + uint8_t *ss, + const uint8_t *pk, + const uint8_t *coins) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber1024/aarch64/kem.h b/crypto_kem/kyber1024/aarch64/kem.h index f6f0bd69..afb78598 100644 --- a/crypto_kem/kyber1024/aarch64/kem.h +++ b/crypto_kem/kyber1024/aarch64/kem.h @@ -7,15 +7,16 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" -#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES -#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES -#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES -#define CRYPTO_BYTES KYBER_SSBYTES - +#if (KYBER_K == 2) +#define CRYPTO_ALGNAME "Kyber512" +#elif (KYBER_K == 3) +#define CRYPTO_ALGNAME "Kyber768" +#elif (KYBER_K == 4) #define CRYPTO_ALGNAME "Kyber1024" +#endif #define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand) int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins); @@ -33,3 +34,4 @@ int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif + diff --git a/crypto_kem/kyber1024/aarch64/macros.inc b/crypto_kem/kyber1024/aarch64/macros.inc index 2add309e..5504405c 100644 --- a/crypto_kem/kyber1024/aarch64/macros.inc +++ b/crypto_kem/kyber1024/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,11 +28,114 @@ * SOFTWARE. */ -#ifndef MACROS_S -#define MACROS_S - #include "macros_common.inc" +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + .macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H .endm @@ -52,16 +158,68 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + .macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm -.macro qo_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm -.macro qo_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H .endm @@ -69,18 +227,176 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + .macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm -.macro do_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H .endm -.macro do_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm -#endif +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== + +.macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 +.endm + +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm +// === 32-bit end ==== diff --git a/crypto_kem/kyber1024/aarch64/macros_common.inc b/crypto_kem/kyber1024/aarch64/macros_common.inc index c1ac021c..07568491 100644 --- a/crypto_kem/kyber1024/aarch64/macros_common.inc +++ b/crypto_kem/kyber1024/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,31 +35,51 @@ .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -75,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -85,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -102,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -138,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -152,34 +310,340 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] mul \t4\wX, \b4\wX, \z4\nX[\h4] sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] mul \t5\wX, \b5\wX, \z5\nX[\h5] sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] mul \t6\wX, \b6\wX, \z6\nX[\h6] sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] mul \t7\wX, \b7\wX, \z7\nX[\h7] + ldr \d0, [\srcd_ptr, \memd0] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] add \a2\wX, \a2\wX, \t2\wX sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] add \a3\wX, \a3\wX, \t3\wX sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -221,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -231,7 +769,53 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D + +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX @@ -248,15 +832,77 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + ldr \c0, [\srcc_ptr, \memc0] mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] mul \t1\wX, \b1\wX, \h1\wX sub \b3\wX, \a3\wX, \t3\wX + ldr \c2, [\srcc_ptr, \memc2] sqrdmulh \b0\wX, \b0\wX, \l0\wX add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] sqrdmulh \b1\wX, \b1\wX, \l1\wX add \a3\wX, \a3\wX, \t3\wX @@ -269,53 +915,53 @@ .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -394,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX diff --git a/crypto_kem/kyber1024/aarch64/neon_poly.c b/crypto_kem/kyber1024/aarch64/neon_poly.c index fc958102..e7ae26ba 100644 --- a/crypto_kem/kyber1024/aarch64/neon_poly.c +++ b/crypto_kem/kyber1024/aarch64/neon_poly.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -129,14 +130,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER1024_AARCH64_asm_add_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER1024_AARCH64__asm_add_reduce(int16_t *, const int16_t *); void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER1024_AARCH64_asm_add_reduce(c, a); + PQCLEAN_KYBER1024_AARCH64__asm_add_reduce(c, a); } -extern void PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); +extern void PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) { - PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce(c, a, b); + PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce(c, a, b); } /************************************************* @@ -150,7 +151,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce(int16_t *, const int16_t *); void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce(c, a); + PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce(c, a); } diff --git a/crypto_kem/kyber1024/aarch64/neon_polyvec.c b/crypto_kem/kyber1024/aarch64/neon_polyvec.c index c05f59d6..8787fcde 100644 --- a/crypto_kem/kyber1024/aarch64/neon_polyvec.c +++ b/crypto_kem/kyber1024/aarch64/neon_polyvec.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -83,7 +84,7 @@ void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]) { * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]) { +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { // c = c + a; @@ -91,4 +92,3 @@ void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYB neon_poly_add_reduce(c[i], a[i]); } } - diff --git a/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c b/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c index a5a2e783..aa096294 100644 --- a/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c +++ b/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -88,8 +89,8 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce1, uint8_t nonce2) { unsigned int i; - uint8_t extkey1[KYBER_SYMBYTES + 1 + 15]; - uint8_t extkey2[KYBER_SYMBYTES + 1 + 15]; + uint8_t extkey1[KYBER_SYMBYTES + 1]; + uint8_t extkey2[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey1[i] = key[i]; @@ -99,5 +100,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, extkey1[i] = nonce1; extkey2[i] = nonce2; - shake256x2(out1, out2, outlen, extkey1, extkey2, KYBER_SYMBYTES + 1); + shake256x2(out1, out2, outlen, extkey1, extkey2, sizeof(extkey1)); } diff --git a/crypto_kem/kyber1024/aarch64/ntt.c b/crypto_kem/kyber1024/aarch64/ntt.c index 8bca765e..69cb756f 100644 --- a/crypto_kem/kyber1024/aarch64/ntt.c +++ b/crypto_kem/kyber1024/aarch64/ntt.c @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,12 +28,35 @@ * SOFTWARE. */ -#include +#include #include "params.h" #include "ntt.h" -#include "reduce.h" #include "NTT_params.h" +const __attribute__ ((aligned (16)))int16_t asymmetric_const[8] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t constants[16] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, + invNQ1_R3modQ1_prime_half, + invNQ1_R3modQ1_doubleprime, + invNQ1_final_R3modQ1_prime_half, + invNQ1_final_R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { +0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 +}; + +const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { +167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { +0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 +}; + /************************************************* * Name: ntt * diff --git a/crypto_kem/kyber1024/aarch64/ntt.h b/crypto_kem/kyber1024/aarch64/ntt.h index fc6d4a94..4e29cfda 100644 --- a/crypto_kem/kyber1024/aarch64/ntt.h +++ b/crypto_kem/kyber1024/aarch64/ntt.h @@ -2,11 +2,14 @@ #define NTT_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -34,11 +37,6 @@ extern const int16_t zetas[128]; -#define ntt KYBER_NAMESPACE(ntt) -void ntt(int16_t r[256]); -#define invntt KYBER_NAMESPACE(invntt) -void invntt(int16_t r[256]); - extern void PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); @@ -49,38 +47,35 @@ extern void PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended(int16_t *, const i extern void PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); extern void PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); -static const int16_t asymmetric_const[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime -}; - -#define NTT(in) { \ - PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } - -#define iNTT(in) { \ - PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } - -static const int16_t constants[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, - invNQ1_R3modQ1_prime_half, - invNQ1_R3modQ1_doubleprime, - invNQ1_final_R3modQ1_prime_half, - invNQ1_final_R3modQ1_doubleprime -}; - -static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, -7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0 -}; - -static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { - 167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175 -}; - -static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0 -}; +extern +const int16_t asymmetric_const[8]; +extern +const int16_t constants[16]; + +extern +const int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; + +extern +const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; + +extern +const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; + + +#define NTT(in) do { \ + PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define ntt KYBER_NAMESPACE(ntt) +void ntt(int16_t r[256]); +#define invntt KYBER_NAMESPACE(invntt) +void invntt(int16_t r[256]); + #endif diff --git a/crypto_kem/kyber1024/aarch64/params.h b/crypto_kem/kyber1024/aarch64/params.h index f6cb8131..455d12a4 100644 --- a/crypto_kem/kyber1024/aarch64/params.h +++ b/crypto_kem/kyber1024/aarch64/params.h @@ -7,11 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -//#define KYBER_90S /* Uncomment this if you want the 90S variant */ - #define KYBER_NAMESPACE(s) PQCLEAN_KYBER1024_AARCH64_##s -#define KYBER_K 4 +/* Don't change parameters below this line */ #define KYBER_N 256 #define KYBER_Q 3329 @@ -21,6 +19,7 @@ #define KYBER_POLYBYTES 384 #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) +#define KYBER_K 4 #define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 160 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) diff --git a/crypto_kem/kyber1024/aarch64/poly.c b/crypto_kem/kyber1024/aarch64/poly.c index 6250e60a..9e7abbd0 100644 --- a/crypto_kem/kyber1024/aarch64/poly.c +++ b/crypto_kem/kyber1024/aarch64/poly.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/kyber/blob/master/ref * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -53,6 +54,22 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N int16_t u; uint8_t t[8]; + #if (KYBER_POLYCOMPRESSEDBYTES == 128) + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + // map to positive standard representatives + u = a[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint16_t)u << 4) + KYBER_Q / 2) / KYBER_Q) & 15; + } + + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; + } + #elif (KYBER_POLYCOMPRESSEDBYTES == 160) for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { // map to positive standard representatives @@ -68,6 +85,9 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N r[4] = (t[6] >> 2) | (t[7] << 3); r += 5; } + #else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" + #endif } /************************************************* @@ -83,6 +103,13 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N void poly_decompress(int16_t r[KYBER_N], const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { unsigned int i; + #if (KYBER_POLYCOMPRESSEDBYTES == 128) + for (i = 0; i < KYBER_N / 2; i++) { + r[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; + r[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; + a += 1; + } + #elif (KYBER_POLYCOMPRESSEDBYTES == 160) unsigned int j; uint8_t t[8]; for (i = 0; i < KYBER_N / 8; i++) { @@ -100,6 +127,9 @@ void poly_decompress(int16_t r[KYBER_N], const uint8_t a[KYBER_POLYCOMPRESSEDBYT r[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; } } + #else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" + #endif } /************************************************* @@ -185,6 +215,10 @@ void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]) unsigned int i, j; int16_t mask; + #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) +#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" + #endif + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { mask = -(int16_t)((msg[i] >> j) & 1); diff --git a/crypto_kem/kyber1024/aarch64/poly.h b/crypto_kem/kyber1024/aarch64/poly.h index 83c35067..ae6bf04d 100644 --- a/crypto_kem/kyber1024/aarch64/poly.h +++ b/crypto_kem/kyber1024/aarch64/poly.h @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial @@ -30,7 +30,7 @@ void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const int16_t a[KYBER_N]); #define poly_frommsg KYBER_NAMESPACE(poly_frommsg) void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]); #define poly_tomsg KYBER_NAMESPACE(poly_tomsg) -void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t a[KYBER_N]); +void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t r[KYBER_N]); // NEON diff --git a/crypto_kem/kyber1024/aarch64/polyvec.c b/crypto_kem/kyber1024/aarch64/polyvec.c index 7142cb39..8907c316 100644 --- a/crypto_kem/kyber1024/aarch64/polyvec.c +++ b/crypto_kem/kyber1024/aarch64/polyvec.c @@ -19,9 +19,10 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; + #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 8; j++) { @@ -45,6 +46,27 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K r += 11; } } + #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + t[k] = a[i][4 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + } + + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; + } + } + #else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" + #endif } /************************************************* @@ -60,6 +82,7 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; + #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 8; j++) { @@ -78,6 +101,24 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV } } } + #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r[i][4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; + } + } + } + #else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" + #endif } /************************************************* @@ -89,7 +130,7 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV * (needs space for KYBER_POLYVECBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { poly_tobytes(r + i * KYBER_POLYBYTES, a[i]); @@ -112,4 +153,3 @@ void polyvec_frombytes(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVE poly_frombytes(r[i], a + i * KYBER_POLYBYTES); } } - diff --git a/crypto_kem/kyber1024/aarch64/polyvec.h b/crypto_kem/kyber1024/aarch64/polyvec.h index 827610d6..69e7db9c 100644 --- a/crypto_kem/kyber1024/aarch64/polyvec.h +++ b/crypto_kem/kyber1024/aarch64/polyvec.h @@ -7,8 +7,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -34,21 +35,21 @@ * SOFTWARE. */ +#include #include "params.h" #include "poly.h" -#include typedef struct { poly vec[KYBER_K]; } polyvec; #define polyvec_compress KYBER_NAMESPACE(polyvec_compress) -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress) void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); #define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes) -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes) void polyvec_frombytes(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECBYTES]); @@ -61,6 +62,6 @@ void neon_polyvec_ntt(int16_t r[KYBER_K][KYBER_N]); void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]); #define neon_polyvec_add_reduce KYBER_NAMESPACE(polyvec_add_reduce) -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]); +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]); #endif diff --git a/crypto_kem/kyber1024/aarch64/reduce.h b/crypto_kem/kyber1024/aarch64/reduce.h index 7d0f8e3b..4a7c3426 100644 --- a/crypto_kem/kyber1024/aarch64/reduce.h +++ b/crypto_kem/kyber1024/aarch64/reduce.h @@ -7,11 +7,11 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" -#define MONT (-1044) // 2^16 mod q -#define QINV (-3327) // q^-1 mod 2^16 +#define MONT -1044 // 2^16 mod q +#define QINV -3327 // q^-1 mod 2^16 #define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce) int16_t montgomery_reduce(int32_t a); diff --git a/crypto_kem/kyber1024/aarch64/rejsample.h b/crypto_kem/kyber1024/aarch64/rejsample.h index ee9ae85c..7a9fb471 100644 --- a/crypto_kem/kyber1024/aarch64/rejsample.h +++ b/crypto_kem/kyber1024/aarch64/rejsample.h @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" #define neon_rej_uniform KYBER_NAMESPACE(_neon_rej_uniform) unsigned int neon_rej_uniform(int16_t *r, diff --git a/crypto_kem/kyber1024/aarch64/symmetric-shake.c b/crypto_kem/kyber1024/aarch64/symmetric-shake.c index 067922ec..14a4c28c 100644 --- a/crypto_kem/kyber1024/aarch64/symmetric-shake.c +++ b/crypto_kem/kyber1024/aarch64/symmetric-shake.c @@ -55,6 +55,8 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM shake256(out, outlen, extkey, sizeof(extkey)); } + + /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf * diff --git a/crypto_kem/kyber1024/aarch64/symmetric.h b/crypto_kem/kyber1024/aarch64/symmetric.h index cb9ea69e..2a59b8b8 100644 --- a/crypto_kem/kyber1024/aarch64/symmetric.h +++ b/crypto_kem/kyber1024/aarch64/symmetric.h @@ -8,9 +8,9 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include #include +#include "params.h" #include "fips202.h" @@ -27,6 +27,7 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM #define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf) void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]); + #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) @@ -36,6 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) + // NEON Definition #include "fips202x2.h" @@ -65,3 +67,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, shake128x2_squeezeblocks(OUT0, OUT1, OUTBLOCKS, STATE) #endif /* SYMMETRIC_H */ + + diff --git a/crypto_kem/kyber1024/aarch64/verify.h b/crypto_kem/kyber1024/aarch64/verify.h index 3b9eca9f..ac78bc35 100644 --- a/crypto_kem/kyber1024/aarch64/verify.h +++ b/crypto_kem/kyber1024/aarch64/verify.h @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include #include +#include "params.h" #define verify KYBER_NAMESPACE(verify) int verify(const uint8_t *a, const uint8_t *b, size_t len); diff --git a/crypto_kem/kyber512/aarch64/LICENSE b/crypto_kem/kyber512/aarch64/LICENSE index 0e259d42..093b0a7d 100644 --- a/crypto_kem/kyber512/aarch64/LICENSE +++ b/crypto_kem/kyber512/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/crypto_kem/kyber512/aarch64/NTT_params.h b/crypto_kem/kyber512/aarch64/NTT_params.h index d0934820..f2607092 100644 --- a/crypto_kem/kyber512/aarch64/NTT_params.h +++ b/crypto_kem/kyber512/aarch64/NTT_params.h @@ -2,7 +2,9 @@ #define NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -40,27 +42,27 @@ #define invomegaQ1 1175 // R = 2^15 below // RmodQ1 = 2^15 mod^{+-} Q1 -#define RmodQ1 (-522) +#define RmodQ1 -522 // Q1prime = Q1^{-1} mod^{+-} 2^15 -#define Q1prime (-3327) +#define Q1prime -3327 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 3303 // R2modQ1 = 2^16 mod^{+-} Q1 -#define R2modQ1 (-1044) +#define R2modQ1 -1044 // Q1prime2 = -Q1^{-1} mod^{+-} 2^16 #define Q1prime2 3327 // R3modQ1 = -2^32 mod^{+-} Q1 -#define R3modQ1 (-1353) +#define R3modQ1 -1353 // R3modQ1_prime = (R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime (-20552) +#define R3modQ1_prime -20552 // R3modQ1_prime_half = ( (R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime_half (-10276) +#define R3modQ1_prime_half -10276 // R3modQ1_doubleprime (R3modQ1_prime Q1 - (R3modQ1 + Q1)) / 2^16 -#define R3modQ1_doubleprime (-1044) +#define R3modQ1_doubleprime -1044 // invNQ1_R3modQ1 = -NTT_N^{-1} 2^32 mod^{+-} Q1 -#define invNQ1_R3modQ1 (-1441) +#define invNQ1_R3modQ1 -1441 // invNQ1_R3modQ1_prime = (invNQ1_R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 #define invNQ1_R3modQ1_prime 10080 // invNQ1_R3modQ1_prime_half = ( (invNQ1_R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 diff --git a/crypto_kem/kyber512/aarch64/__asm_NTT.S b/crypto_kem/kyber512/aarch64/__asm_NTT.S index 47b75efa..ebcf6bd5 100644 --- a/crypto_kem/kyber512/aarch64/__asm_NTT.S +++ b/crypto_kem/kyber512/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,165 +36,188 @@ PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top: _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top: - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 + push_simd + Q .req w8 + src .req x0 + table .req x1 + counter .req x11 ldrsh Q, [x2, #0] - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 + ldr q0, [table, # 0*16] + ldr q1, [table, # 1*16] + ldr q2, [table, # 2*16] + ldr q3, [table, # 3*16] mov v0.H[0], Q - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr q13, [src, # 9*32] + ldr q15, [src, #11*32] + ldr q17, [src, #13*32] + ldr q19, [src, #15*32] + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32 + + qo_butterfly_mixll \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32, \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixsls \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_botsls \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #9*32, #11*32, #13*32, #15*32, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32 + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) + + qo_butterfly_mixl \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixss \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_botss \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 + .unreq src .unreq table .unreq counter - pop_all - - br lr + pop_simd + ret .align 2 .global PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot @@ -199,13 +225,13 @@ _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top: PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot: _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot: - push_all - Q .req w20 - BarrettM .req w21 + push_simd + Q .req w8 + BarrettM .req w9 src0 .req x0 src1 .req x1 - table .req x28 - counter .req x19 + table .req x10 + counter .req x11 ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] @@ -215,99 +241,127 @@ _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, # 1*16] + ldr q29, [src1, # 1*16] + ldr q30, [src0, # 3*16] + ldr q31, [src1, # 3*16] + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 mov counter, #3 _ntt_bot_loop: - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + str q28, [src0, # 1*16] + ldr q28, [src0, #(64+1*16)] + str q29, [src1, # 1*16] + ldr q29, [src1, #(64+1*16)] + str q30, [src0, # 3*16] + ldr q30, [src0, #(64+3*16)] + str q31, [src1, # 3*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + str q28, [src0, # 1*16] + str q29, [src1, # 1*16] + str q30, [src0, # 3*16] + str q31, [src1, # 3*16] + + add src0, src0, #64 + add src1, src1, #64 .unreq Q .unreq BarrettM @@ -315,12 +369,9 @@ _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot: .unreq src1 .unreq table .unreq counter - pop_all - - br lr - - + pop_simd + ret diff --git a/crypto_kem/kyber512/aarch64/__asm_base_mul.S b/crypto_kem/kyber512/aarch64/__asm_base_mul.S index 1c346564..a75bb649 100644 --- a/crypto_kem/kyber512/aarch64/__asm_base_mul.S +++ b/crypto_kem/kyber512/aarch64/__asm_base_mul.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -44,44 +47,195 @@ _PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended: ldrsh Q, [x3] - dup v20.8H, Q - - // TODO: unroll this, currently we are using only 16 SIMD registers - mov counter, #4 - _point_mul_extended_loop: - - ld2 { v0.8H, v1.8H}, [src1], #32 - ld2 { v2.8H, v3.8H}, [src1], #32 - ld2 { v4.8H, v5.8H}, [src1], #32 - ld2 { v6.8H, v7.8H}, [src1], #32 + dup v28.8H, Q - ld2 { v8.8H, v9.8H}, [src2ex], #32 - ld2 {v10.8H, v11.8H}, [src2ex], #32 - ld2 {v12.8H, v13.8H}, [src2ex], #32 - ld2 {v14.8H, v15.8H}, [src2ex], #32 + ldr q0, [src1, #0*16] + ldr q1, [src1, #1*16] + ldr q2, [src1, #2*16] + ldr q3, [src1, #3*16] + ldr q4, [src1, #4*16] + ldr q5, [src1, #5*16] + ldr q6, [src1, #6*16] + ldr q7, [src1, #7*16] + + add src1, src1, #8*16 + + uzp2 v1.8H, v0.8H, v1.8H + uzp2 v3.8H, v2.8H, v3.8H + uzp2 v5.8H, v4.8H, v5.8H + uzp2 v7.8H, v6.8H, v7.8H + + ldr q8, [src2ex, #0*16] + ldr q10, [src2ex, #2*16] + ldr q12, [src2ex, #4*16] + ldr q14, [src2ex, #6*16] + ldr q9, [src2ex, #1*16] + ldr q11, [src2ex, #3*16] + ldr q13, [src2ex, #5*16] + ldr q15, [src2ex, #7*16] + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] + sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] + sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] + sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] + sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] + mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] + mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] + mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] + mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q0, [src1, #0*16] + sqrdmulh v16.8H, v17.8H, v8.8H + ldr q1, [src1, #1*16] + sqrdmulh v18.8H, v19.8H, v10.8H + ldr q2, [src1, #2*16] + sqrdmulh v20.8H, v21.8H, v12.8H + ldr q3, [src1, #3*16] + sqrdmulh v22.8H, v23.8H, v14.8H + + ldr q4, [src1, #4*16] + mul v17.8H, v17.8H, v9.8H + uzp2 v1.8H, v0.8H, v1.8H + ldr q5, [src1, #5*16] + mul v19.8H, v19.8H, v11.8H + uzp2 v3.8H, v2.8H, v3.8H + ldr q6, [src1, #6*16] + mul v21.8H, v21.8H, v13.8H + uzp2 v5.8H, v4.8H, v5.8H + ldr q7, [src1, #7*16] + mul v23.8H, v23.8H, v15.8H + uzp2 v7.8H, v6.8H, v7.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v17.8H, v16.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v19.8H, v18.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v21.8H, v20.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v23.8H, v22.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q17, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q19, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q21, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q23, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H - mls v1.8H, v0.8H, v20.8H - mls v3.8H, v2.8H, v20.8H - mls v5.8H, v4.8H, v20.8H - mls v7.8H, v6.8H, v20.8H + add src1, src1, #8*16 - st1 { v1.8H}, [des], #16 - st1 { v3.8H}, [des], #16 - st1 { v5.8H}, [des], #16 - st1 { v7.8H}, [des], #16 + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + sqrdmulh v16.8H, v17.8H, v8.8H + sqrdmulh v18.8H, v19.8H, v10.8H + sqrdmulh v20.8H, v21.8H, v12.8H + sqrdmulh v22.8H, v23.8H, v14.8H + + mul v17.8H, v17.8H, v9.8H + mul v19.8H, v19.8H, v11.8H + mul v21.8H, v21.8H, v13.8H + mul v23.8H, v23.8H, v15.8H + + mls v17.8H, v16.8H, v28.8H + mls v19.8H, v18.8H, v28.8H + mls v21.8H, v20.8H, v28.8H + mls v23.8H, v22.8H, v28.8H + + str q17, [des, #0*16] + str q19, [des, #1*16] + str q21, [des, #2*16] + str q23, [des, #3*16] + + add des, des, #4*16 - sub counter, counter, #1 - cbnz counter, _point_mul_extended_loop .unreq Q .unreq des @@ -90,7 +244,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended: .unreq counter pop_all - br lr + ret .align 2 @@ -100,8 +254,6 @@ PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: push_all - Q .req w28 - Qprime2 .req w27 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -117,8 +269,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] + ldr s4, [x3] add des, x4, #0 @@ -138,94 +289,294 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: add src2asy_3, src2asy_0, #256*3 #endif - dup v28.8H, Q - dup v29.8H, Qprime2 + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - // TODO:interleaving - mov counter, #16 - _asymmetric_mul_loop: + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + // TODO:interleaving + mov counter, #15 + _asymmetric_mul_loop: + + ldr q20, [src1_0, #0*16] + uzp1 v6.8H, v16.8H, v18.8H + ldr q21, [src1_0, #1*16] + uzp1 v7.8H, v17.8H, v19.8H + + ldr q22, [src2_0, #0*16] + mul v6.8H, v6.8H, v4.H[1] + ldr q23, [src2_0, #1*16] + mul v7.8H, v7.8H, v4.H[1] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + smlal v16.4S, v6.4H, v4.H[0] + uzp1 v0.8H, v20.8H, v21.8H + smlal2 v18.4S, v6.8H, v4.H[0] + uzp2 v1.8H, v20.8H, v21.8H + smlal v17.4S, v7.4H, v4.H[0] + uzp1 v2.8H, v22.8H, v23.8H + smlal2 v19.4S, v7.8H, v4.H[0] + uzp2 v3.8H, v22.8H, v23.8H + + ld1 {v28.8H}, [src2asy_0], #16 + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H +#endif + +#else - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - st2 {v24.8H, v25.8H}, [des], #32 +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_loop - .unreq Q - .unreq Qprime2 + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -242,7 +593,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: .unreq counter pop_all - br lr + ret .align 2 @@ -252,10 +603,6 @@ PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: push_all - Q .req w28 - Qprime2 .req w27 - R3 .req w26 - R3p .req w25 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -271,11 +618,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] - - ldrsh R3, [x3, #8] - ldrsh R3p, [x3, #10] + ldr q4, [x3] add des, x4, #0 @@ -295,108 +638,312 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: add src2asy_3, src2asy_0, #256*3 #endif - dup v26.8H, R3 - dup v27.8H, R3p + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - dup v28.8H, Q - dup v29.8H, Qprime2 + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 - // TODO: interleaving - mov counter, #16 - _asymmetric_mul_montgomery_loop: + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + mov counter, #15 + _asymmetric_mul_montgomery_loop: + + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + ldr q20, [src1_0, #0*16] + smlal v16.4S, v6.4H, v4.H[0] + ldr q21, [src1_0, #1*16] + smlal2 v18.4S, v6.8H, v4.H[0] + ldr q22, [src2_0, #0*16] + smlal v17.4S, v7.4H, v4.H[0] + ldr q23, [src2_0, #1*16] + smlal2 v19.4S, v7.8H, v4.H[0] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H + uzp1 v0.8H, v20.8H, v21.8H + sqrdmulh v16.8H, v6.8H, v4.H[4] + uzp2 v1.8H, v20.8H, v21.8H + sqrdmulh v17.8H, v7.8H, v4.H[4] - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + uzp1 v2.8H, v22.8H, v23.8H + mul v6.8H, v6.8H, v4.H[5] + uzp2 v3.8H, v22.8H, v23.8H + mul v7.8H, v7.8H, v4.H[5] - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] - sqrdmulh v16.8H, v24.8H, v26.8H - sqrdmulh v17.8H, v25.8H, v26.8H + st2 { v6.8H, v7.8H}, [des], #32 - mul v24.8H, v24.8H, v27.8H - mul v25.8H, v25.8H, v27.8H + ld1 {v28.8H}, [src2asy_0], #16 - mls v24.8H, v16.8H, v28.8H - mls v25.8H, v17.8H, v28.8H + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H + +#endif - st2 {v24.8H, v25.8H}, [des], #32 +#else + + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H + +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_montgomery_loop - .unreq Q - .unreq Qprime2 - .unreq R3 - .unreq R3p + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + sqrdmulh v16.8H, v6.8H, v4.H[4] + sqrdmulh v17.8H, v7.8H, v4.H[4] + + mul v6.8H, v6.8H, v4.H[5] + mul v7.8H, v7.8H, v4.H[5] + + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -413,7 +960,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: .unreq counter pop_all - br lr + ret diff --git a/crypto_kem/kyber512/aarch64/__asm_iNTT.S b/crypto_kem/kyber512/aarch64/__asm_iNTT.S index 7acb200f..57fb734f 100644 --- a/crypto_kem/kyber512/aarch64/__asm_iNTT.S +++ b/crypto_kem/kyber512/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -49,57 +52,116 @@ _PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - mov counter, #4 + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, #1*16] + ldr q29, [src1, #1*16] + ldr q30, [src0, #3*16] + ldr q31, [src1, #3*16] + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 + + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 + + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 + + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 + + mov counter, #3 _intt_bot_loop: - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM + str q24, [src0, #0*16] + ldr q28, [src0, #(64+1*16)] + str q25, [src1, #0*16] + ldr q29, [src1, #(64+1*16)] + str q26, [src0, #2*16] + ldr q30, [src0, #(64+3*16)] + str q27, [src1, #2*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 - do_butterfly_vec_mixed_rev v28, v30, v18, v19, v29, v31, v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, v8, v9, v10, v11 - do_butterfly_vec_mixed_rev v24, v26, v16, v17, v25, v27, v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, v6, v7, v6, v7 - do_butterfly_vec_mixed_rev v28, v29, v18, v19, v30, v31, v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, v4, v5, v4, v5 - do_butterfly_vec_mixed_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 - do_butterfly_vec_mixed_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 - qo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v1, #11, v0 + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 sub counter, counter, #1 cbnz counter, _intt_bot_loop + str q24, [src0, #0*16] + str q25, [src1, #0*16] + str q26, [src0, #2*16] + str q27, [src1, #2*16] + + .unreq Q .unreq BarrettM .unreq src0 @@ -108,7 +170,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot: .unreq counter pop_all - br lr + ret .align 2 .global PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top @@ -121,245 +183,131 @@ _PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top: BarrettM .req w21 invN .req w22 invN_f .req w23 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 + table .req x1 counter .req x19 - ldrsh Q, [x2, #0] + ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] - ldr invN, [x2, #10] - ldr invN_f, [x2, #14] - - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 - - mov v0.H[0], Q - - dup v24.8H, Q - dup v25.8H, BarrettM - - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr invN, [x2, #10] + ldr invN_f, [x2, #14] + + mov v4.S[0], invN + mov v4.S[1], invN_f + + ldr q0, [table, #0*16] + mov v0.H[0], Q + + ldr q1, [table, #1*16] + ldr q2, [table, #2*16] + ldr q3, [table, #3*16] + + ldr q16, [src, # 8*32] + ldr q17, [src, # 9*32] + ldr q18, [src, #10*32] + ldr q19, [src, #11*32] + ldr q20, [src, #12*32] + ldr q21, [src, #13*32] + ldr q22, [src, #14*32] + ldr q23, [src, #15*32] + + qo_butterfly_botll \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q12, q13, q14, q15, \ + #4*32, #5*32, #6*32, #7*32 + + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_topsl \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #8*32, #9*32, #10*32, #11*32, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + qo_montgomery_mul_insl \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #12*32, #13*32, #14*32, #15*32, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_butterfly_botsl_mul \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32), \ + v12, v13, v14, v15, v24, v25, v26, v27, \ + v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0 + + str q12, [src, # 4*32] + ldr q12, [src, #(16+ 4*32)] + str q13, [src, # 5*32] + ldr q13, [src, #(16+ 5*32)] + str q14, [src, # 6*32] + ldr q14, [src, #(16+ 6*32)] + str q15, [src, # 7*32] + ldr q15, [src, #(16+ 7*32)] + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_tops \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + + qo_montgomery_mul_ins \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_montgomery_mul_ins \ + v12, v13, v14, v15, v24, v25, v26, v27, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32) + + str q12, [src, #(16+ 4*32)] + str q13, [src, #(16+ 5*32)] + str q14, [src, #(16+ 6*32)] + str q15, [src, #(16+ 7*32)] .unreq Q .unreq BarrettM .unreq invN .unreq invN_f - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr - - - - + ret diff --git a/crypto_kem/kyber512/aarch64/__asm_poly.S b/crypto_kem/kyber512/aarch64/__asm_poly.S index d3dcefc6..175d01ab 100644 --- a/crypto_kem/kyber512/aarch64/__asm_poly.S +++ b/crypto_kem/kyber512/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_KYBER512_AARCH64_asm_add_reduce -.global _PQCLEAN_KYBER512_AARCH64_asm_add_reduce -PQCLEAN_KYBER512_AARCH64_asm_add_reduce: -_PQCLEAN_KYBER512_AARCH64_asm_add_reduce: +.global PQCLEAN_KYBER512_AARCH64__asm_add_reduce +.global _PQCLEAN_KYBER512_AARCH64__asm_add_reduce +PQCLEAN_KYBER512_AARCH64__asm_add_reduce: +_PQCLEAN_KYBER512_AARCH64__asm_add_reduce: mov w4, #3329 mov w5, #25519 @@ -86,13 +89,13 @@ _PQCLEAN_KYBER512_AARCH64_asm_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER512_AARCH64_asm_sub_reduce -.global _PQCLEAN_KYBER512_AARCH64_asm_sub_reduce -PQCLEAN_KYBER512_AARCH64_asm_sub_reduce: -_PQCLEAN_KYBER512_AARCH64_asm_sub_reduce: +.global PQCLEAN_KYBER512_AARCH64__asm_sub_reduce +.global _PQCLEAN_KYBER512_AARCH64__asm_sub_reduce +PQCLEAN_KYBER512_AARCH64__asm_sub_reduce: +_PQCLEAN_KYBER512_AARCH64__asm_sub_reduce: mov w4, #3329 mov w5, #25519 @@ -147,13 +150,13 @@ _PQCLEAN_KYBER512_AARCH64_asm_sub_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce -.global _PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce -PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce: -_PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce: +.global PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce +.global _PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce +PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce: +_PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce: mov w4, #3329 mov w5, #25519 @@ -232,7 +235,7 @@ _PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v20.8H, v21.8H, v22.8H, v23.8H}, [x0], #64 - br lr + ret diff --git a/crypto_kem/kyber512/aarch64/api.h b/crypto_kem/kyber512/aarch64/api.h index 8f1010d6..97d81a11 100644 --- a/crypto_kem/kyber512/aarch64/api.h +++ b/crypto_kem/kyber512/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef PQCLEAN_KYBER512_AARCH64_API_H -#define PQCLEAN_KYBER512_AARCH64_API_H +#ifndef API_H +#define API_H /* * This file is licensed @@ -13,7 +13,7 @@ #define PQCLEAN_KYBER512_AARCH64_CRYPTO_PUBLICKEYBYTES 800 #define PQCLEAN_KYBER512_AARCH64_CRYPTO_CIPHERTEXTBYTES 768 #define PQCLEAN_KYBER512_AARCH64_CRYPTO_BYTES 32 -#define PQCLEAN_KYBER512_AARCH64_CRYPTO_ALGNAME "Kyber512" +#define PQCLEAN_KYBER512_AARCH64_CRYPTO_ALGNAME "Kyber512" int PQCLEAN_KYBER512_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); diff --git a/crypto_kem/kyber512/aarch64/cbd.c b/crypto_kem/kyber512/aarch64/cbd.c index c26fd7fd..a96d0516 100644 --- a/crypto_kem/kyber512/aarch64/cbd.c +++ b/crypto_kem/kyber512/aarch64/cbd.c @@ -127,6 +127,7 @@ void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) { * * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) **************************************************/ +#if KYBER_ETA1 == 3 static uint32_t load24_littleendian(const uint8_t x[3]) { uint32_t r; r = (uint32_t)x[0]; @@ -134,6 +135,7 @@ static uint32_t load24_littleendian(const uint8_t x[3]) { r |= (uint32_t)x[2] << 16; return r; } +#endif /************************************************* * Name: cbd3 @@ -146,6 +148,7 @@ static uint32_t load24_littleendian(const uint8_t x[3]) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ +#if KYBER_ETA1 == 3 static void cbd3(int16_t *r, const uint8_t buf[3 * KYBER_N / 4]) { unsigned int i, j; uint32_t t, d; @@ -164,11 +167,22 @@ static void cbd3(int16_t *r, const uint8_t buf[3 * KYBER_N / 4]) { } } } +#endif void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + #if KYBER_ETA1 == 2 + neon_cbd2(r, buf); + #elif KYBER_ETA1 == 3 cbd3(r, buf); + #else +#error "This implementation requires eta1 in {2,3}" + #endif } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + #if KYBER_ETA2 == 2 neon_cbd2(r, buf); + #else +#error "This implementation requires eta2 = 2" + #endif } diff --git a/crypto_kem/kyber512/aarch64/cbd.h b/crypto_kem/kyber512/aarch64/cbd.h index 47a06806..688abf43 100644 --- a/crypto_kem/kyber512/aarch64/cbd.h +++ b/crypto_kem/kyber512/aarch64/cbd.h @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "poly.h" -#include #define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1) void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); diff --git a/crypto_kem/kyber512/aarch64/feat.S b/crypto_kem/kyber512/aarch64/feat.S index c214d6f3..f467fa80 100644 --- a/crypto_kem/kyber512/aarch64/feat.S +++ b/crypto_kem/kyber512/aarch64/feat.S @@ -123,10 +123,8 @@ SOFTWARE. .endm .align 4 -.global PQCLEAN_KYBER512_AARCH64_f1600x2 -.global _PQCLEAN_KYBER512_AARCH64_f1600x2 -PQCLEAN_KYBER512_AARCH64_f1600x2: -_PQCLEAN_KYBER512_AARCH64_f1600x2: +.global _f1600x2 +_f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! diff --git a/crypto_kem/kyber512/aarch64/fips202x2.c b/crypto_kem/kyber512/aarch64/fips202x2.c index 464d5309..e045ee3d 100644 --- a/crypto_kem/kyber512/aarch64/fips202x2.c +++ b/crypto_kem/kyber512/aarch64/fips202x2.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -36,6 +37,11 @@ #include #include "fips202x2.h" +#ifdef PROFILE_HASHING +#include "hal.h" +extern unsigned long long hash_cycles; +#endif + #define NROUNDS 24 // Define NEON operation @@ -47,20 +53,20 @@ #define vxor(c, a, b) c = veorq_u64(a, b); // Rotate by n bit ((a << offset) ^ (a >> (64-offset))) #define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); + out = vshlq_n_u64(a, offset); \ + out = vsriq_n_u64(out, a, 64 - offset); // Xor chain: out = a ^ b ^ c ^ d ^ e #define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); // Not And c = ~a & b // #define vbic(c, a, b) c = vbicq_u64(b, a); // Xor Not And: out = a ^ ( (~b) & c) #define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); // Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support #define vrxor(c, a, b) c = vrax1q_u64(a, b); // End Define @@ -100,11 +106,11 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { * * Arguments: - uint64_t *state: pointer to input/output Keccak state **************************************************/ -extern void PQCLEAN_KYBER512_AARCH64_f1600x2(v128 *, const uint64_t *); +extern void f1600x2(v128 *, const uint64_t *); static inline void KeccakF1600_StatePermutex2(v128 state[25]) { #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_KYBER512_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); + f1600x2(state, neon_KeccakF_RoundConstants); #else v128 Aba, Abe, Abi, Abo, Abu; v128 Aga, Age, Agi, Ago, Agu; @@ -551,7 +557,14 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -570,7 +583,14 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -587,7 +607,14 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -606,7 +633,14 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -625,6 +659,9 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -644,6 +681,10 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -662,6 +703,9 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -681,4 +725,8 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } diff --git a/crypto_kem/kyber512/aarch64/fips202x2.h b/crypto_kem/kyber512/aarch64/fips202x2.h index 14ceb782..3066c52b 100644 --- a/crypto_kem/kyber512/aarch64/fips202x2.h +++ b/crypto_kem/kyber512/aarch64/fips202x2.h @@ -8,9 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" -#include #include +#include typedef uint64x2_t v128; @@ -23,31 +22,26 @@ typedef struct { v128 s[25]; } keccakx2_state; -#define shake128x2_absorb KYBER_NAMESPACE(shake128x2_absorb) void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake128x2_squeezeblocks KYBER_NAMESPACE(shake128x2_squeezeblocks) void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake256x2_absorb KYBER_NAMESPACE(shake256x2_absorb) void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake256x2_squeezeblocks KYBER_NAMESPACE(shake256x2_squeezeblocks) void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake128x2 KYBER_NAMESPACE(shake128x2) void shake128x2(uint8_t *out0, uint8_t *out1, size_t outlen, @@ -55,7 +49,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in1, size_t inlen); -#define shake256x2 KYBER_NAMESPACE(shake256x2) void shake256x2(uint8_t *out0, uint8_t *out1, size_t outlen, diff --git a/crypto_kem/kyber512/aarch64/indcpa.c b/crypto_kem/kyber512/aarch64/indcpa.c index bff6b3bf..0bca9a64 100644 --- a/crypto_kem/kyber512/aarch64/indcpa.c +++ b/crypto_kem/kyber512/aarch64/indcpa.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -161,6 +162,7 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; neon_xof_state state; + #if KYBER_K == 2 for (unsigned int i = 0; i < KYBER_K; i++) { if (transposed) { neon_xof_absorb(&state, seed, i, i, 0, 1); @@ -187,6 +189,129 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); } } + #elif KYBER_K == 3 + int16_t *s1 = NULL, *s2 = NULL; + unsigned int x1, x2, y1, y2; + xof_state c_state; + + for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2) { + switch (j) { + case 0: + s1 = &(a[0][0][0]); + s2 = &(a[0][1][0]); + x1 = 0; + y1 = 0; + x2 = 0; + y2 = 1; + break; + case 2: + s1 = &(a[0][2][0]); + s2 = &(a[1][0][0]); + x1 = 0; + y1 = 2; + x2 = 1; + y2 = 0; + break; + case 4: + s1 = &(a[1][1][0]); + s2 = &(a[1][2][0]); + x1 = 1; + y1 = 1; + x2 = 1; + y2 = 2; + break; + default: + s1 = &(a[2][0][0]); + s2 = &(a[2][1][0]); + x1 = 2; + y1 = 0; + x2 = 2; + y2 = 1; + break; + } + + if (transposed) { + neon_xof_absorb(&state, seed, x1, x2, y1, y2); + } else { + neon_xof_absorb(&state, seed, y1, y2, x1, x2); + } + + neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); + + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + + ctr0 = neon_rej_uniform(s1, buf0); + ctr1 = neon_rej_uniform(s2, buf1); + + while (ctr0 < KYBER_N || ctr1 < KYBER_N) { + off = buflen % 3; + for (k = 0; k < off; k++) { + buf0[k] = buf0[buflen - off + k]; + buf1[k] = buf1[buflen - off + k]; + } + neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); + + buflen = off + XOF_BLOCKBYTES; + ctr0 += rej_uniform(s1 + ctr0, KYBER_N - ctr0, buf0, buflen); + ctr1 += rej_uniform(s2 + ctr1, KYBER_N - ctr1, buf1, buflen); + } + } + + // Last iteration [2][2] + if (transposed) { + xof_absorb(&c_state, seed, 2, 2); + } else { + xof_absorb(&c_state, seed, 2, 2); + } + + xof_squeezeblocks(buf0, GEN_MATRIX_NBLOCKS, &c_state); + + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + + ctr0 = neon_rej_uniform(&(a[2][2][0]), buf0); + + while (ctr0 < KYBER_N) { + off = buflen % 3; + for (k = 0; k < off; k++) { + buf0[k] = buf0[buflen - off + k]; + } + xof_squeezeblocks(buf0 + off, 1, &c_state); + + buflen = off + XOF_BLOCKBYTES; + ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); + } + + #elif KYBER_K == 4 + for (unsigned int i = 0; i < KYBER_K; i++) { + for (unsigned int j = 0; j < KYBER_K; j += 2) { + if (transposed) { + neon_xof_absorb(&state, seed, i, i, j, j + 1); + } else { + neon_xof_absorb(&state, seed, j, j + 1, i, i); + } + + neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr0 = neon_rej_uniform(&(a[i][j][0]), buf0); + ctr1 = neon_rej_uniform(&(a[i][j + 1][0]), buf1); + + while (ctr0 < KYBER_N || ctr1 < KYBER_N) { + off = buflen % 3; + for (k = 0; k < off; k++) { + buf0[k] = buf0[buflen - off + k]; + buf1[k] = buf1[buflen - off + k]; + } + neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); + + buflen = off + XOF_BLOCKBYTES; + ctr0 += rej_uniform(&(a[i][j][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); + ctr1 += rej_uniform(&(a[i][j + 1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); + } + } + } + #else +#error "KYBER_K must be in {2,3,4}" + #endif } /************************************************* @@ -201,8 +326,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]) { + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; uint8_t buf[2 * KYBER_SYMBYTES]; const uint8_t *publicseed = buf; @@ -217,8 +342,19 @@ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], gen_a(a, publicseed); + #if KYBER_K == 2 neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3); + #elif KYBER_K == 3 + neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); + neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(e[0][0]), noiseseed, 2, 3); + neon_poly_getnoise_eta1_2x(&(e[1][0]), &(e[2][0]), noiseseed, 4, 5); + #elif KYBER_K == 4 + neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); + neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(skpv[3][0]), noiseseed, 2, 3); + neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 4, 5); + neon_poly_getnoise_eta1_2x(&(e[2][0]), &(e[3][0]), noiseseed, 6, 7); + #endif neon_polyvec_ntt(skpv); neon_polyvec_ntt(e); @@ -274,10 +410,32 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], poly_frommsg(k, m); gen_at(at, seed); + #if KYBER_K == 2 // ETA1 != ETA2 (3 != 2) neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3); neon_poly_getnoise_eta2(&(epp[0]), coins, 4); + #elif KYBER_K == 3 + #if KYBER_ETA1 == KYBER_ETA2 + // Because ETA1 == ETA2 + neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); + neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(ep[0][0]), coins, 2, 3); + neon_poly_getnoise_eta1_2x(&(ep[1][0]), &(ep[2][0]), coins, 4, 5); + neon_poly_getnoise_eta2(&(epp[0]), coins, 6); + #else +#error "We need eta1 == eta2 here" + #endif + #elif KYBER_K == 4 + #if KYBER_ETA1 == KYBER_ETA2 + neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); + neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3); + neon_poly_getnoise_eta1_2x(&(ep[0][0]), &(ep[1][0]), coins, 4, 5); + neon_poly_getnoise_eta1_2x(&(ep[2][0]), &(ep[3][0]), coins, 6, 7); + neon_poly_getnoise_eta2(&(epp[0]), coins, 8); + #else +#error "We need eta1 == eta2 here" + #endif + #endif neon_polyvec_ntt(sp); diff --git a/crypto_kem/kyber512/aarch64/indcpa.h b/crypto_kem/kyber512/aarch64/indcpa.h index f93487a3..30608327 100644 --- a/crypto_kem/kyber512/aarch64/indcpa.h +++ b/crypto_kem/kyber512/aarch64/indcpa.h @@ -7,16 +7,16 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "polyvec.h" -#include #define gen_matrix KYBER_NAMESPACE(gen_matrix) void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); #define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand) void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]); + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]); #define indcpa_enc KYBER_NAMESPACE(indcpa_enc) void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], diff --git a/crypto_kem/kyber512/aarch64/kem.c b/crypto_kem/kyber512/aarch64/kem.c index 670a4c59..a71d5ac6 100644 --- a/crypto_kem/kyber512/aarch64/kem.c +++ b/crypto_kem/kyber512/aarch64/kem.c @@ -8,12 +8,15 @@ #include #include #include + +#include "api.h" #include "params.h" +#include "kem.h" #include "indcpa.h" #include "verify.h" #include "symmetric.h" #include "randombytes.h" -#include "kem.h" + /************************************************* * Name: crypto_kem_keypair_derand @@ -31,8 +34,8 @@ * Returns 0 (success) **************************************************/ int crypto_kem_keypair_derand(uint8_t *pk, - uint8_t *sk, - const uint8_t *coins) { + uint8_t *sk, + const uint8_t *coins) { indcpa_keypair_derand(pk, sk, coins); memcpy(sk + KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES); hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); @@ -62,6 +65,8 @@ int crypto_kem_keypair(uint8_t *pk, return 0; } + + /************************************************* * Name: crypto_kem_enc_derand * @@ -80,9 +85,9 @@ int crypto_kem_keypair(uint8_t *pk, * Returns 0 (success) **************************************************/ int crypto_kem_enc_derand(uint8_t *ct, - uint8_t *ss, - const uint8_t *pk, - const uint8_t *coins) { + uint8_t *ss, + const uint8_t *pk, + const uint8_t *coins) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber512/aarch64/kem.h b/crypto_kem/kyber512/aarch64/kem.h index 8b730b6c..afb78598 100644 --- a/crypto_kem/kyber512/aarch64/kem.h +++ b/crypto_kem/kyber512/aarch64/kem.h @@ -7,15 +7,16 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" -#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES -#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES -#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES -#define CRYPTO_BYTES KYBER_SSBYTES - +#if (KYBER_K == 2) #define CRYPTO_ALGNAME "Kyber512" +#elif (KYBER_K == 3) +#define CRYPTO_ALGNAME "Kyber768" +#elif (KYBER_K == 4) +#define CRYPTO_ALGNAME "Kyber1024" +#endif #define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand) int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins); @@ -33,3 +34,4 @@ int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif + diff --git a/crypto_kem/kyber512/aarch64/macros.inc b/crypto_kem/kyber512/aarch64/macros.inc index 2add309e..5504405c 100644 --- a/crypto_kem/kyber512/aarch64/macros.inc +++ b/crypto_kem/kyber512/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,11 +28,114 @@ * SOFTWARE. */ -#ifndef MACROS_S -#define MACROS_S - #include "macros_common.inc" +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + .macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H .endm @@ -52,16 +158,68 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + .macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm -.macro qo_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm -.macro qo_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H .endm @@ -69,18 +227,176 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + .macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm -.macro do_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H .endm -.macro do_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm -#endif +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== + +.macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 +.endm + +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm +// === 32-bit end ==== diff --git a/crypto_kem/kyber512/aarch64/macros_common.inc b/crypto_kem/kyber512/aarch64/macros_common.inc index c1ac021c..07568491 100644 --- a/crypto_kem/kyber512/aarch64/macros_common.inc +++ b/crypto_kem/kyber512/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,31 +35,51 @@ .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -75,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -85,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -102,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -138,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -152,34 +310,340 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] mul \t4\wX, \b4\wX, \z4\nX[\h4] sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] mul \t5\wX, \b5\wX, \z5\nX[\h5] sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] mul \t6\wX, \b6\wX, \z6\nX[\h6] sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] mul \t7\wX, \b7\wX, \z7\nX[\h7] + ldr \d0, [\srcd_ptr, \memd0] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] add \a2\wX, \a2\wX, \t2\wX sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] add \a3\wX, \a3\wX, \t3\wX sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -221,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -231,7 +769,53 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D + +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX @@ -248,15 +832,77 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + ldr \c0, [\srcc_ptr, \memc0] mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] mul \t1\wX, \b1\wX, \h1\wX sub \b3\wX, \a3\wX, \t3\wX + ldr \c2, [\srcc_ptr, \memc2] sqrdmulh \b0\wX, \b0\wX, \l0\wX add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] sqrdmulh \b1\wX, \b1\wX, \l1\wX add \a3\wX, \a3\wX, \t3\wX @@ -269,53 +915,53 @@ .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -394,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX diff --git a/crypto_kem/kyber512/aarch64/neon_poly.c b/crypto_kem/kyber512/aarch64/neon_poly.c index 1d9efe85..1bb1fa7c 100644 --- a/crypto_kem/kyber512/aarch64/neon_poly.c +++ b/crypto_kem/kyber512/aarch64/neon_poly.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -129,14 +130,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER512_AARCH64_asm_add_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER512_AARCH64__asm_add_reduce(int16_t *, const int16_t *); void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER512_AARCH64_asm_add_reduce(c, a); + PQCLEAN_KYBER512_AARCH64__asm_add_reduce(c, a); } -extern void PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); +extern void PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) { - PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce(c, a, b); + PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce(c, a, b); } /************************************************* @@ -150,7 +151,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER512_AARCH64_asm_sub_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER512_AARCH64__asm_sub_reduce(int16_t *, const int16_t *); void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER512_AARCH64_asm_sub_reduce(c, a); + PQCLEAN_KYBER512_AARCH64__asm_sub_reduce(c, a); } diff --git a/crypto_kem/kyber512/aarch64/neon_polyvec.c b/crypto_kem/kyber512/aarch64/neon_polyvec.c index c05f59d6..8787fcde 100644 --- a/crypto_kem/kyber512/aarch64/neon_polyvec.c +++ b/crypto_kem/kyber512/aarch64/neon_polyvec.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -83,7 +84,7 @@ void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]) { * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]) { +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { // c = c + a; @@ -91,4 +92,3 @@ void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYB neon_poly_add_reduce(c[i], a[i]); } } - diff --git a/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c b/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c index a5a2e783..aa096294 100644 --- a/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c +++ b/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -88,8 +89,8 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce1, uint8_t nonce2) { unsigned int i; - uint8_t extkey1[KYBER_SYMBYTES + 1 + 15]; - uint8_t extkey2[KYBER_SYMBYTES + 1 + 15]; + uint8_t extkey1[KYBER_SYMBYTES + 1]; + uint8_t extkey2[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey1[i] = key[i]; @@ -99,5 +100,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, extkey1[i] = nonce1; extkey2[i] = nonce2; - shake256x2(out1, out2, outlen, extkey1, extkey2, KYBER_SYMBYTES + 1); + shake256x2(out1, out2, outlen, extkey1, extkey2, sizeof(extkey1)); } diff --git a/crypto_kem/kyber512/aarch64/ntt.c b/crypto_kem/kyber512/aarch64/ntt.c index 8bca765e..69cb756f 100644 --- a/crypto_kem/kyber512/aarch64/ntt.c +++ b/crypto_kem/kyber512/aarch64/ntt.c @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,12 +28,35 @@ * SOFTWARE. */ -#include +#include #include "params.h" #include "ntt.h" -#include "reduce.h" #include "NTT_params.h" +const __attribute__ ((aligned (16)))int16_t asymmetric_const[8] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t constants[16] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, + invNQ1_R3modQ1_prime_half, + invNQ1_R3modQ1_doubleprime, + invNQ1_final_R3modQ1_prime_half, + invNQ1_final_R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { +0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 +}; + +const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { +167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { +0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 +}; + /************************************************* * Name: ntt * diff --git a/crypto_kem/kyber512/aarch64/ntt.h b/crypto_kem/kyber512/aarch64/ntt.h index bbf83610..aceddc54 100644 --- a/crypto_kem/kyber512/aarch64/ntt.h +++ b/crypto_kem/kyber512/aarch64/ntt.h @@ -2,11 +2,14 @@ #define NTT_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -34,11 +37,6 @@ extern const int16_t zetas[128]; -#define ntt KYBER_NAMESPACE(ntt) -void ntt(int16_t r[256]); -#define invntt KYBER_NAMESPACE(invntt) -void invntt(int16_t r[256]); - extern void PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); @@ -49,38 +47,35 @@ extern void PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended(int16_t *, const in extern void PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); extern void PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); -static const int16_t asymmetric_const[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime -}; - -#define NTT(in) { \ - PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } - -#define iNTT(in) { \ - PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } - -static const int16_t constants[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, - invNQ1_R3modQ1_prime_half, - invNQ1_R3modQ1_doubleprime, - invNQ1_final_R3modQ1_prime_half, - invNQ1_final_R3modQ1_doubleprime -}; - -static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, -7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0 -}; - -static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { - 167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175 -}; - -static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0 -}; +extern +const int16_t asymmetric_const[8]; +extern +const int16_t constants[16]; + +extern +const int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; + +extern +const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; + +extern +const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; + + +#define NTT(in) do { \ + PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define ntt KYBER_NAMESPACE(ntt) +void ntt(int16_t r[256]); +#define invntt KYBER_NAMESPACE(invntt) +void invntt(int16_t r[256]); + #endif diff --git a/crypto_kem/kyber512/aarch64/params.h b/crypto_kem/kyber512/aarch64/params.h index 91d415bb..2b741df9 100644 --- a/crypto_kem/kyber512/aarch64/params.h +++ b/crypto_kem/kyber512/aarch64/params.h @@ -7,11 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -//#define KYBER_90S /* Uncomment this if you want the 90S variant */ - #define KYBER_NAMESPACE(s) PQCLEAN_KYBER512_AARCH64_##s -#define KYBER_K 2 +/* Don't change parameters below this line */ #define KYBER_N 256 #define KYBER_Q 3329 @@ -21,6 +19,7 @@ #define KYBER_POLYBYTES 384 #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) +#define KYBER_K 2 #define KYBER_ETA1 3 #define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) diff --git a/crypto_kem/kyber512/aarch64/poly.c b/crypto_kem/kyber512/aarch64/poly.c index 7d5dbe66..9e7abbd0 100644 --- a/crypto_kem/kyber512/aarch64/poly.c +++ b/crypto_kem/kyber512/aarch64/poly.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/kyber/blob/master/ref * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -53,6 +54,7 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N int16_t u; uint8_t t[8]; + #if (KYBER_POLYCOMPRESSEDBYTES == 128) for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { // map to positive standard representatives @@ -67,6 +69,25 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N r[3] = t[6] | (t[7] << 4); r += 4; } + #elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + // map to positive standard representatives + u = a[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31; + } + + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; + } + #else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" + #endif } /************************************************* @@ -82,11 +103,33 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N void poly_decompress(int16_t r[KYBER_N], const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { unsigned int i; + #if (KYBER_POLYCOMPRESSEDBYTES == 128) for (i = 0; i < KYBER_N / 2; i++) { r[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; r[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; a += 1; } + #elif (KYBER_POLYCOMPRESSEDBYTES == 160) + unsigned int j; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 5) | (a[1] << 3); + t[2] = (a[1] >> 2); + t[3] = (a[1] >> 7) | (a[2] << 1); + t[4] = (a[2] >> 4) | (a[3] << 4); + t[5] = (a[3] >> 1); + t[6] = (a[3] >> 6) | (a[4] << 2); + t[7] = (a[4] >> 3); + a += 5; + + for (j = 0; j < 8; j++) { + r[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; + } + } + #else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" + #endif } /************************************************* @@ -172,6 +215,10 @@ void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]) unsigned int i, j; int16_t mask; + #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) +#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" + #endif + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { mask = -(int16_t)((msg[i] >> j) & 1); diff --git a/crypto_kem/kyber512/aarch64/poly.h b/crypto_kem/kyber512/aarch64/poly.h index 83c35067..ae6bf04d 100644 --- a/crypto_kem/kyber512/aarch64/poly.h +++ b/crypto_kem/kyber512/aarch64/poly.h @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial @@ -30,7 +30,7 @@ void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const int16_t a[KYBER_N]); #define poly_frommsg KYBER_NAMESPACE(poly_frommsg) void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]); #define poly_tomsg KYBER_NAMESPACE(poly_tomsg) -void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t a[KYBER_N]); +void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t r[KYBER_N]); // NEON diff --git a/crypto_kem/kyber512/aarch64/polyvec.c b/crypto_kem/kyber512/aarch64/polyvec.c index d495809e..8907c316 100644 --- a/crypto_kem/kyber512/aarch64/polyvec.c +++ b/crypto_kem/kyber512/aarch64/polyvec.c @@ -19,9 +19,34 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; + #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + for (k = 0; k < 8; k++) { + t[k] = a[i][8 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; + } + + r[ 0] = (t[0] >> 0); + r[ 1] = (t[0] >> 8) | (t[1] << 3); + r[ 2] = (t[1] >> 5) | (t[2] << 6); + r[ 3] = (t[2] >> 2); + r[ 4] = (t[2] >> 10) | (t[3] << 1); + r[ 5] = (t[3] >> 7) | (t[4] << 4); + r[ 6] = (t[4] >> 4) | (t[5] << 7); + r[ 7] = (t[5] >> 1); + r[ 8] = (t[5] >> 9) | (t[6] << 2); + r[ 9] = (t[6] >> 6) | (t[7] << 5); + r[10] = (t[7] >> 3); + r += 11; + } + } + #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { @@ -39,6 +64,9 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K r += 5; } } + #else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" + #endif } /************************************************* @@ -54,6 +82,26 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; + #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); + t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); + t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); + t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); + t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); + t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); + t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); + t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); + a += 11; + + for (k = 0; k < 8; k++) { + r[i][8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; + } + } + } + #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { @@ -68,6 +116,9 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV } } } + #else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" + #endif } /************************************************* @@ -79,7 +130,7 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV * (needs space for KYBER_POLYVECBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { poly_tobytes(r + i * KYBER_POLYBYTES, a[i]); diff --git a/crypto_kem/kyber512/aarch64/polyvec.h b/crypto_kem/kyber512/aarch64/polyvec.h index 827610d6..69e7db9c 100644 --- a/crypto_kem/kyber512/aarch64/polyvec.h +++ b/crypto_kem/kyber512/aarch64/polyvec.h @@ -7,8 +7,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -34,21 +35,21 @@ * SOFTWARE. */ +#include #include "params.h" #include "poly.h" -#include typedef struct { poly vec[KYBER_K]; } polyvec; #define polyvec_compress KYBER_NAMESPACE(polyvec_compress) -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress) void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); #define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes) -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes) void polyvec_frombytes(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECBYTES]); @@ -61,6 +62,6 @@ void neon_polyvec_ntt(int16_t r[KYBER_K][KYBER_N]); void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]); #define neon_polyvec_add_reduce KYBER_NAMESPACE(polyvec_add_reduce) -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]); +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]); #endif diff --git a/crypto_kem/kyber512/aarch64/reduce.h b/crypto_kem/kyber512/aarch64/reduce.h index 7d0f8e3b..4a7c3426 100644 --- a/crypto_kem/kyber512/aarch64/reduce.h +++ b/crypto_kem/kyber512/aarch64/reduce.h @@ -7,11 +7,11 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" -#define MONT (-1044) // 2^16 mod q -#define QINV (-3327) // q^-1 mod 2^16 +#define MONT -1044 // 2^16 mod q +#define QINV -3327 // q^-1 mod 2^16 #define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce) int16_t montgomery_reduce(int32_t a); diff --git a/crypto_kem/kyber512/aarch64/rejsample.h b/crypto_kem/kyber512/aarch64/rejsample.h index ee9ae85c..7a9fb471 100644 --- a/crypto_kem/kyber512/aarch64/rejsample.h +++ b/crypto_kem/kyber512/aarch64/rejsample.h @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" #define neon_rej_uniform KYBER_NAMESPACE(_neon_rej_uniform) unsigned int neon_rej_uniform(int16_t *r, diff --git a/crypto_kem/kyber512/aarch64/symmetric-shake.c b/crypto_kem/kyber512/aarch64/symmetric-shake.c index 067922ec..14a4c28c 100644 --- a/crypto_kem/kyber512/aarch64/symmetric-shake.c +++ b/crypto_kem/kyber512/aarch64/symmetric-shake.c @@ -55,6 +55,8 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM shake256(out, outlen, extkey, sizeof(extkey)); } + + /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf * diff --git a/crypto_kem/kyber512/aarch64/symmetric.h b/crypto_kem/kyber512/aarch64/symmetric.h index cb9ea69e..2a59b8b8 100644 --- a/crypto_kem/kyber512/aarch64/symmetric.h +++ b/crypto_kem/kyber512/aarch64/symmetric.h @@ -8,9 +8,9 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include #include +#include "params.h" #include "fips202.h" @@ -27,6 +27,7 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM #define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf) void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]); + #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) @@ -36,6 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) + // NEON Definition #include "fips202x2.h" @@ -65,3 +67,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, shake128x2_squeezeblocks(OUT0, OUT1, OUTBLOCKS, STATE) #endif /* SYMMETRIC_H */ + + diff --git a/crypto_kem/kyber512/aarch64/verify.h b/crypto_kem/kyber512/aarch64/verify.h index 3b9eca9f..ac78bc35 100644 --- a/crypto_kem/kyber512/aarch64/verify.h +++ b/crypto_kem/kyber512/aarch64/verify.h @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include #include +#include "params.h" #define verify KYBER_NAMESPACE(verify) int verify(const uint8_t *a, const uint8_t *b, size_t len); diff --git a/crypto_kem/kyber768/aarch64/LICENSE b/crypto_kem/kyber768/aarch64/LICENSE index 0e259d42..093b0a7d 100644 --- a/crypto_kem/kyber768/aarch64/LICENSE +++ b/crypto_kem/kyber768/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/crypto_kem/kyber768/aarch64/NTT_params.h b/crypto_kem/kyber768/aarch64/NTT_params.h index d0934820..f2607092 100644 --- a/crypto_kem/kyber768/aarch64/NTT_params.h +++ b/crypto_kem/kyber768/aarch64/NTT_params.h @@ -2,7 +2,9 @@ #define NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -40,27 +42,27 @@ #define invomegaQ1 1175 // R = 2^15 below // RmodQ1 = 2^15 mod^{+-} Q1 -#define RmodQ1 (-522) +#define RmodQ1 -522 // Q1prime = Q1^{-1} mod^{+-} 2^15 -#define Q1prime (-3327) +#define Q1prime -3327 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 3303 // R2modQ1 = 2^16 mod^{+-} Q1 -#define R2modQ1 (-1044) +#define R2modQ1 -1044 // Q1prime2 = -Q1^{-1} mod^{+-} 2^16 #define Q1prime2 3327 // R3modQ1 = -2^32 mod^{+-} Q1 -#define R3modQ1 (-1353) +#define R3modQ1 -1353 // R3modQ1_prime = (R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime (-20552) +#define R3modQ1_prime -20552 // R3modQ1_prime_half = ( (R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime_half (-10276) +#define R3modQ1_prime_half -10276 // R3modQ1_doubleprime (R3modQ1_prime Q1 - (R3modQ1 + Q1)) / 2^16 -#define R3modQ1_doubleprime (-1044) +#define R3modQ1_doubleprime -1044 // invNQ1_R3modQ1 = -NTT_N^{-1} 2^32 mod^{+-} Q1 -#define invNQ1_R3modQ1 (-1441) +#define invNQ1_R3modQ1 -1441 // invNQ1_R3modQ1_prime = (invNQ1_R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 #define invNQ1_R3modQ1_prime 10080 // invNQ1_R3modQ1_prime_half = ( (invNQ1_R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 diff --git a/crypto_kem/kyber768/aarch64/__asm_NTT.S b/crypto_kem/kyber768/aarch64/__asm_NTT.S index eb766164..fe9f9e82 100644 --- a/crypto_kem/kyber768/aarch64/__asm_NTT.S +++ b/crypto_kem/kyber768/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,165 +36,188 @@ PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top: _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top: - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 + push_simd + Q .req w8 + src .req x0 + table .req x1 + counter .req x11 ldrsh Q, [x2, #0] - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 + ldr q0, [table, # 0*16] + ldr q1, [table, # 1*16] + ldr q2, [table, # 2*16] + ldr q3, [table, # 3*16] mov v0.H[0], Q - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr q13, [src, # 9*32] + ldr q15, [src, #11*32] + ldr q17, [src, #13*32] + ldr q19, [src, #15*32] + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32 + + qo_butterfly_mixll \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32, \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixsls \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_botsls \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #9*32, #11*32, #13*32, #15*32, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32 + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) + + qo_butterfly_mixl \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixss \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_botss \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 + .unreq src .unreq table .unreq counter - pop_all - - br lr + pop_simd + ret .align 2 .global PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot @@ -199,13 +225,13 @@ _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top: PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot: _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot: - push_all - Q .req w20 - BarrettM .req w21 + push_simd + Q .req w8 + BarrettM .req w9 src0 .req x0 src1 .req x1 - table .req x28 - counter .req x19 + table .req x10 + counter .req x11 ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] @@ -215,99 +241,127 @@ _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, # 1*16] + ldr q29, [src1, # 1*16] + ldr q30, [src0, # 3*16] + ldr q31, [src1, # 3*16] + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 mov counter, #3 _ntt_bot_loop: - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + str q28, [src0, # 1*16] + ldr q28, [src0, #(64+1*16)] + str q29, [src1, # 1*16] + ldr q29, [src1, #(64+1*16)] + str q30, [src0, # 3*16] + ldr q30, [src0, #(64+3*16)] + str q31, [src1, # 3*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + str q28, [src0, # 1*16] + str q29, [src1, # 1*16] + str q30, [src0, # 3*16] + str q31, [src1, # 3*16] + + add src0, src0, #64 + add src1, src1, #64 .unreq Q .unreq BarrettM @@ -315,12 +369,9 @@ _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot: .unreq src1 .unreq table .unreq counter - pop_all - - br lr - - + pop_simd + ret diff --git a/crypto_kem/kyber768/aarch64/__asm_base_mul.S b/crypto_kem/kyber768/aarch64/__asm_base_mul.S index cc4636a6..f9fed3d3 100644 --- a/crypto_kem/kyber768/aarch64/__asm_base_mul.S +++ b/crypto_kem/kyber768/aarch64/__asm_base_mul.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -44,44 +47,195 @@ _PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended: ldrsh Q, [x3] - dup v20.8H, Q - - // TODO: unroll this, currently we are using only 16 SIMD registers - mov counter, #4 - _point_mul_extended_loop: - - ld2 { v0.8H, v1.8H}, [src1], #32 - ld2 { v2.8H, v3.8H}, [src1], #32 - ld2 { v4.8H, v5.8H}, [src1], #32 - ld2 { v6.8H, v7.8H}, [src1], #32 + dup v28.8H, Q - ld2 { v8.8H, v9.8H}, [src2ex], #32 - ld2 {v10.8H, v11.8H}, [src2ex], #32 - ld2 {v12.8H, v13.8H}, [src2ex], #32 - ld2 {v14.8H, v15.8H}, [src2ex], #32 + ldr q0, [src1, #0*16] + ldr q1, [src1, #1*16] + ldr q2, [src1, #2*16] + ldr q3, [src1, #3*16] + ldr q4, [src1, #4*16] + ldr q5, [src1, #5*16] + ldr q6, [src1, #6*16] + ldr q7, [src1, #7*16] + + add src1, src1, #8*16 + + uzp2 v1.8H, v0.8H, v1.8H + uzp2 v3.8H, v2.8H, v3.8H + uzp2 v5.8H, v4.8H, v5.8H + uzp2 v7.8H, v6.8H, v7.8H + + ldr q8, [src2ex, #0*16] + ldr q10, [src2ex, #2*16] + ldr q12, [src2ex, #4*16] + ldr q14, [src2ex, #6*16] + ldr q9, [src2ex, #1*16] + ldr q11, [src2ex, #3*16] + ldr q13, [src2ex, #5*16] + ldr q15, [src2ex, #7*16] + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] + sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] + sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] + sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] + sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] + mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] + mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] + mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] + mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q0, [src1, #0*16] + sqrdmulh v16.8H, v17.8H, v8.8H + ldr q1, [src1, #1*16] + sqrdmulh v18.8H, v19.8H, v10.8H + ldr q2, [src1, #2*16] + sqrdmulh v20.8H, v21.8H, v12.8H + ldr q3, [src1, #3*16] + sqrdmulh v22.8H, v23.8H, v14.8H + + ldr q4, [src1, #4*16] + mul v17.8H, v17.8H, v9.8H + uzp2 v1.8H, v0.8H, v1.8H + ldr q5, [src1, #5*16] + mul v19.8H, v19.8H, v11.8H + uzp2 v3.8H, v2.8H, v3.8H + ldr q6, [src1, #6*16] + mul v21.8H, v21.8H, v13.8H + uzp2 v5.8H, v4.8H, v5.8H + ldr q7, [src1, #7*16] + mul v23.8H, v23.8H, v15.8H + uzp2 v7.8H, v6.8H, v7.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v17.8H, v16.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v19.8H, v18.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v21.8H, v20.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v23.8H, v22.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q17, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q19, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q21, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q23, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H - mls v1.8H, v0.8H, v20.8H - mls v3.8H, v2.8H, v20.8H - mls v5.8H, v4.8H, v20.8H - mls v7.8H, v6.8H, v20.8H + add src1, src1, #8*16 - st1 { v1.8H}, [des], #16 - st1 { v3.8H}, [des], #16 - st1 { v5.8H}, [des], #16 - st1 { v7.8H}, [des], #16 + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + sqrdmulh v16.8H, v17.8H, v8.8H + sqrdmulh v18.8H, v19.8H, v10.8H + sqrdmulh v20.8H, v21.8H, v12.8H + sqrdmulh v22.8H, v23.8H, v14.8H + + mul v17.8H, v17.8H, v9.8H + mul v19.8H, v19.8H, v11.8H + mul v21.8H, v21.8H, v13.8H + mul v23.8H, v23.8H, v15.8H + + mls v17.8H, v16.8H, v28.8H + mls v19.8H, v18.8H, v28.8H + mls v21.8H, v20.8H, v28.8H + mls v23.8H, v22.8H, v28.8H + + str q17, [des, #0*16] + str q19, [des, #1*16] + str q21, [des, #2*16] + str q23, [des, #3*16] + + add des, des, #4*16 - sub counter, counter, #1 - cbnz counter, _point_mul_extended_loop .unreq Q .unreq des @@ -90,7 +244,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended: .unreq counter pop_all - br lr + ret .align 2 @@ -100,8 +254,6 @@ PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: push_all - Q .req w28 - Qprime2 .req w27 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -117,8 +269,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] + ldr s4, [x3] add des, x4, #0 @@ -138,94 +289,294 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: add src2asy_3, src2asy_0, #256*3 #endif - dup v28.8H, Q - dup v29.8H, Qprime2 + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - // TODO:interleaving - mov counter, #16 - _asymmetric_mul_loop: + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + // TODO:interleaving + mov counter, #15 + _asymmetric_mul_loop: + + ldr q20, [src1_0, #0*16] + uzp1 v6.8H, v16.8H, v18.8H + ldr q21, [src1_0, #1*16] + uzp1 v7.8H, v17.8H, v19.8H + + ldr q22, [src2_0, #0*16] + mul v6.8H, v6.8H, v4.H[1] + ldr q23, [src2_0, #1*16] + mul v7.8H, v7.8H, v4.H[1] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + smlal v16.4S, v6.4H, v4.H[0] + uzp1 v0.8H, v20.8H, v21.8H + smlal2 v18.4S, v6.8H, v4.H[0] + uzp2 v1.8H, v20.8H, v21.8H + smlal v17.4S, v7.4H, v4.H[0] + uzp1 v2.8H, v22.8H, v23.8H + smlal2 v19.4S, v7.8H, v4.H[0] + uzp2 v3.8H, v22.8H, v23.8H + + ld1 {v28.8H}, [src2asy_0], #16 + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H +#endif + +#else - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - st2 {v24.8H, v25.8H}, [des], #32 +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_loop - .unreq Q - .unreq Qprime2 + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -242,7 +593,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: .unreq counter pop_all - br lr + ret .align 2 @@ -252,10 +603,6 @@ PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: push_all - Q .req w28 - Qprime2 .req w27 - R3 .req w26 - R3p .req w25 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -271,11 +618,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] - - ldrsh R3, [x3, #8] - ldrsh R3p, [x3, #10] + ldr q4, [x3] add des, x4, #0 @@ -295,108 +638,312 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: add src2asy_3, src2asy_0, #256*3 #endif - dup v26.8H, R3 - dup v27.8H, R3p + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - dup v28.8H, Q - dup v29.8H, Qprime2 + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 - // TODO: interleaving - mov counter, #16 - _asymmetric_mul_montgomery_loop: + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + mov counter, #15 + _asymmetric_mul_montgomery_loop: + + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + ldr q20, [src1_0, #0*16] + smlal v16.4S, v6.4H, v4.H[0] + ldr q21, [src1_0, #1*16] + smlal2 v18.4S, v6.8H, v4.H[0] + ldr q22, [src2_0, #0*16] + smlal v17.4S, v7.4H, v4.H[0] + ldr q23, [src2_0, #1*16] + smlal2 v19.4S, v7.8H, v4.H[0] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H + uzp1 v0.8H, v20.8H, v21.8H + sqrdmulh v16.8H, v6.8H, v4.H[4] + uzp2 v1.8H, v20.8H, v21.8H + sqrdmulh v17.8H, v7.8H, v4.H[4] - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + uzp1 v2.8H, v22.8H, v23.8H + mul v6.8H, v6.8H, v4.H[5] + uzp2 v3.8H, v22.8H, v23.8H + mul v7.8H, v7.8H, v4.H[5] - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] - sqrdmulh v16.8H, v24.8H, v26.8H - sqrdmulh v17.8H, v25.8H, v26.8H + st2 { v6.8H, v7.8H}, [des], #32 - mul v24.8H, v24.8H, v27.8H - mul v25.8H, v25.8H, v27.8H + ld1 {v28.8H}, [src2asy_0], #16 - mls v24.8H, v16.8H, v28.8H - mls v25.8H, v17.8H, v28.8H + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H + +#endif - st2 {v24.8H, v25.8H}, [des], #32 +#else + + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H + +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_montgomery_loop - .unreq Q - .unreq Qprime2 - .unreq R3 - .unreq R3p + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + sqrdmulh v16.8H, v6.8H, v4.H[4] + sqrdmulh v17.8H, v7.8H, v4.H[4] + + mul v6.8H, v6.8H, v4.H[5] + mul v7.8H, v7.8H, v4.H[5] + + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -413,7 +960,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: .unreq counter pop_all - br lr + ret diff --git a/crypto_kem/kyber768/aarch64/__asm_iNTT.S b/crypto_kem/kyber768/aarch64/__asm_iNTT.S index 7ddb5925..f3e79823 100644 --- a/crypto_kem/kyber768/aarch64/__asm_iNTT.S +++ b/crypto_kem/kyber768/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -49,57 +52,116 @@ _PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - mov counter, #4 + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, #1*16] + ldr q29, [src1, #1*16] + ldr q30, [src0, #3*16] + ldr q31, [src1, #3*16] + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 + + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 + + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 + + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 + + mov counter, #3 _intt_bot_loop: - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM + str q24, [src0, #0*16] + ldr q28, [src0, #(64+1*16)] + str q25, [src1, #0*16] + ldr q29, [src1, #(64+1*16)] + str q26, [src0, #2*16] + ldr q30, [src0, #(64+3*16)] + str q27, [src1, #2*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 - do_butterfly_vec_mixed_rev v28, v30, v18, v19, v29, v31, v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, v8, v9, v10, v11 - do_butterfly_vec_mixed_rev v24, v26, v16, v17, v25, v27, v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, v6, v7, v6, v7 - do_butterfly_vec_mixed_rev v28, v29, v18, v19, v30, v31, v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, v4, v5, v4, v5 - do_butterfly_vec_mixed_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 - do_butterfly_vec_mixed_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 - qo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v1, #11, v0 + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 sub counter, counter, #1 cbnz counter, _intt_bot_loop + str q24, [src0, #0*16] + str q25, [src1, #0*16] + str q26, [src0, #2*16] + str q27, [src1, #2*16] + + .unreq Q .unreq BarrettM .unreq src0 @@ -108,7 +170,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot: .unreq counter pop_all - br lr + ret .align 2 .global PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top @@ -121,245 +183,131 @@ _PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top: BarrettM .req w21 invN .req w22 invN_f .req w23 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 + table .req x1 counter .req x19 - ldrsh Q, [x2, #0] + ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] - ldr invN, [x2, #10] - ldr invN_f, [x2, #14] - - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 - - mov v0.H[0], Q - - dup v24.8H, Q - dup v25.8H, BarrettM - - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr invN, [x2, #10] + ldr invN_f, [x2, #14] + + mov v4.S[0], invN + mov v4.S[1], invN_f + + ldr q0, [table, #0*16] + mov v0.H[0], Q + + ldr q1, [table, #1*16] + ldr q2, [table, #2*16] + ldr q3, [table, #3*16] + + ldr q16, [src, # 8*32] + ldr q17, [src, # 9*32] + ldr q18, [src, #10*32] + ldr q19, [src, #11*32] + ldr q20, [src, #12*32] + ldr q21, [src, #13*32] + ldr q22, [src, #14*32] + ldr q23, [src, #15*32] + + qo_butterfly_botll \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q12, q13, q14, q15, \ + #4*32, #5*32, #6*32, #7*32 + + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_topsl \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #8*32, #9*32, #10*32, #11*32, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + qo_montgomery_mul_insl \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #12*32, #13*32, #14*32, #15*32, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_butterfly_botsl_mul \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32), \ + v12, v13, v14, v15, v24, v25, v26, v27, \ + v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0 + + str q12, [src, # 4*32] + ldr q12, [src, #(16+ 4*32)] + str q13, [src, # 5*32] + ldr q13, [src, #(16+ 5*32)] + str q14, [src, # 6*32] + ldr q14, [src, #(16+ 6*32)] + str q15, [src, # 7*32] + ldr q15, [src, #(16+ 7*32)] + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_tops \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + + qo_montgomery_mul_ins \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_montgomery_mul_ins \ + v12, v13, v14, v15, v24, v25, v26, v27, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32) + + str q12, [src, #(16+ 4*32)] + str q13, [src, #(16+ 5*32)] + str q14, [src, #(16+ 6*32)] + str q15, [src, #(16+ 7*32)] .unreq Q .unreq BarrettM .unreq invN .unreq invN_f - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr - - - - + ret diff --git a/crypto_kem/kyber768/aarch64/__asm_poly.S b/crypto_kem/kyber768/aarch64/__asm_poly.S index b934f878..be524b33 100644 --- a/crypto_kem/kyber768/aarch64/__asm_poly.S +++ b/crypto_kem/kyber768/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_KYBER768_AARCH64_asm_add_reduce -.global _PQCLEAN_KYBER768_AARCH64_asm_add_reduce -PQCLEAN_KYBER768_AARCH64_asm_add_reduce: -_PQCLEAN_KYBER768_AARCH64_asm_add_reduce: +.global PQCLEAN_KYBER768_AARCH64__asm_add_reduce +.global _PQCLEAN_KYBER768_AARCH64__asm_add_reduce +PQCLEAN_KYBER768_AARCH64__asm_add_reduce: +_PQCLEAN_KYBER768_AARCH64__asm_add_reduce: mov w4, #3329 mov w5, #25519 @@ -86,13 +89,13 @@ _PQCLEAN_KYBER768_AARCH64_asm_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER768_AARCH64_asm_sub_reduce -.global _PQCLEAN_KYBER768_AARCH64_asm_sub_reduce -PQCLEAN_KYBER768_AARCH64_asm_sub_reduce: -_PQCLEAN_KYBER768_AARCH64_asm_sub_reduce: +.global PQCLEAN_KYBER768_AARCH64__asm_sub_reduce +.global _PQCLEAN_KYBER768_AARCH64__asm_sub_reduce +PQCLEAN_KYBER768_AARCH64__asm_sub_reduce: +_PQCLEAN_KYBER768_AARCH64__asm_sub_reduce: mov w4, #3329 mov w5, #25519 @@ -147,13 +150,13 @@ _PQCLEAN_KYBER768_AARCH64_asm_sub_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce -.global _PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce -PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce: -_PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce: +.global PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce +.global _PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce +PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce: +_PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce: mov w4, #3329 mov w5, #25519 @@ -232,7 +235,7 @@ _PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v20.8H, v21.8H, v22.8H, v23.8H}, [x0], #64 - br lr + ret diff --git a/crypto_kem/kyber768/aarch64/api.h b/crypto_kem/kyber768/aarch64/api.h index bb4877ad..39b13746 100644 --- a/crypto_kem/kyber768/aarch64/api.h +++ b/crypto_kem/kyber768/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef PQCLEAN_KYBER768_AARCH64_API_H -#define PQCLEAN_KYBER768_AARCH64_API_H +#ifndef API_H +#define API_H /* * This file is licensed @@ -13,7 +13,7 @@ #define PQCLEAN_KYBER768_AARCH64_CRYPTO_PUBLICKEYBYTES 1184 #define PQCLEAN_KYBER768_AARCH64_CRYPTO_CIPHERTEXTBYTES 1088 #define PQCLEAN_KYBER768_AARCH64_CRYPTO_BYTES 32 -#define PQCLEAN_KYBER768_AARCH64_CRYPTO_ALGNAME "Kyber768" +#define PQCLEAN_KYBER768_AARCH64_CRYPTO_ALGNAME "Kyber768" int PQCLEAN_KYBER768_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); diff --git a/crypto_kem/kyber768/aarch64/cbd.c b/crypto_kem/kyber768/aarch64/cbd.c index 6ae95c03..a96d0516 100644 --- a/crypto_kem/kyber768/aarch64/cbd.c +++ b/crypto_kem/kyber768/aarch64/cbd.c @@ -127,6 +127,15 @@ void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) { * * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) **************************************************/ +#if KYBER_ETA1 == 3 +static uint32_t load24_littleendian(const uint8_t x[3]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} +#endif /************************************************* * Name: cbd3 @@ -139,11 +148,41 @@ void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ +#if KYBER_ETA1 == 3 +static void cbd3(int16_t *r, const uint8_t buf[3 * KYBER_N / 4]) { + unsigned int i, j; + uint32_t t, d; + int16_t a, b; + + for (i = 0; i < KYBER_N / 4; i++) { + t = load24_littleendian(buf + 3 * i); + d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) { + a = (d >> (6 * j + 0)) & 0x7; + b = (d >> (6 * j + 3)) & 0x7; + r[4 * i + j] = a - b; + } + } +} +#endif void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { + #if KYBER_ETA1 == 2 neon_cbd2(r, buf); + #elif KYBER_ETA1 == 3 + cbd3(r, buf); + #else +#error "This implementation requires eta1 in {2,3}" + #endif } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { + #if KYBER_ETA2 == 2 neon_cbd2(r, buf); + #else +#error "This implementation requires eta2 = 2" + #endif } diff --git a/crypto_kem/kyber768/aarch64/cbd.h b/crypto_kem/kyber768/aarch64/cbd.h index 47a06806..688abf43 100644 --- a/crypto_kem/kyber768/aarch64/cbd.h +++ b/crypto_kem/kyber768/aarch64/cbd.h @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "poly.h" -#include #define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1) void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); diff --git a/crypto_kem/kyber768/aarch64/feat.S b/crypto_kem/kyber768/aarch64/feat.S index ce72974b..f467fa80 100644 --- a/crypto_kem/kyber768/aarch64/feat.S +++ b/crypto_kem/kyber768/aarch64/feat.S @@ -123,10 +123,8 @@ SOFTWARE. .endm .align 4 -.global PQCLEAN_KYBER768_AARCH64_f1600x2 -.global _PQCLEAN_KYBER768_AARCH64_f1600x2 -PQCLEAN_KYBER768_AARCH64_f1600x2: -_PQCLEAN_KYBER768_AARCH64_f1600x2: +.global _f1600x2 +_f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! diff --git a/crypto_kem/kyber768/aarch64/fips202x2.c b/crypto_kem/kyber768/aarch64/fips202x2.c index 77e1945d..e045ee3d 100644 --- a/crypto_kem/kyber768/aarch64/fips202x2.c +++ b/crypto_kem/kyber768/aarch64/fips202x2.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -36,6 +37,11 @@ #include #include "fips202x2.h" +#ifdef PROFILE_HASHING +#include "hal.h" +extern unsigned long long hash_cycles; +#endif + #define NROUNDS 24 // Define NEON operation @@ -47,20 +53,20 @@ #define vxor(c, a, b) c = veorq_u64(a, b); // Rotate by n bit ((a << offset) ^ (a >> (64-offset))) #define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); + out = vshlq_n_u64(a, offset); \ + out = vsriq_n_u64(out, a, 64 - offset); // Xor chain: out = a ^ b ^ c ^ d ^ e #define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); // Not And c = ~a & b // #define vbic(c, a, b) c = vbicq_u64(b, a); // Xor Not And: out = a ^ ( (~b) & c) #define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); // Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support #define vrxor(c, a, b) c = vrax1q_u64(a, b); // End Define @@ -100,11 +106,11 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { * * Arguments: - uint64_t *state: pointer to input/output Keccak state **************************************************/ -extern void PQCLEAN_KYBER768_AARCH64_f1600x2(v128 *, const uint64_t *); +extern void f1600x2(v128 *, const uint64_t *); static inline void KeccakF1600_StatePermutex2(v128 state[25]) { #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_KYBER768_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); + f1600x2(state, neon_KeccakF_RoundConstants); #else v128 Aba, Abe, Abi, Abo, Abu; v128 Aga, Age, Agi, Ago, Agu; @@ -551,7 +557,14 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -570,7 +583,14 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -587,7 +607,14 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -606,7 +633,14 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -625,6 +659,9 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -644,6 +681,10 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -662,6 +703,9 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -681,4 +725,8 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } diff --git a/crypto_kem/kyber768/aarch64/fips202x2.h b/crypto_kem/kyber768/aarch64/fips202x2.h index 14ceb782..3066c52b 100644 --- a/crypto_kem/kyber768/aarch64/fips202x2.h +++ b/crypto_kem/kyber768/aarch64/fips202x2.h @@ -8,9 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" -#include #include +#include typedef uint64x2_t v128; @@ -23,31 +22,26 @@ typedef struct { v128 s[25]; } keccakx2_state; -#define shake128x2_absorb KYBER_NAMESPACE(shake128x2_absorb) void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake128x2_squeezeblocks KYBER_NAMESPACE(shake128x2_squeezeblocks) void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake256x2_absorb KYBER_NAMESPACE(shake256x2_absorb) void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake256x2_squeezeblocks KYBER_NAMESPACE(shake256x2_squeezeblocks) void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake128x2 KYBER_NAMESPACE(shake128x2) void shake128x2(uint8_t *out0, uint8_t *out1, size_t outlen, @@ -55,7 +49,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in1, size_t inlen); -#define shake256x2 KYBER_NAMESPACE(shake256x2) void shake256x2(uint8_t *out0, uint8_t *out1, size_t outlen, diff --git a/crypto_kem/kyber768/aarch64/indcpa.c b/crypto_kem/kyber768/aarch64/indcpa.c index 3e571c50..d9c60ffa 100644 --- a/crypto_kem/kyber768/aarch64/indcpa.c +++ b/crypto_kem/kyber768/aarch64/indcpa.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -161,6 +162,34 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; neon_xof_state state; + #if KYBER_K == 2 + for (unsigned int i = 0; i < KYBER_K; i++) { + if (transposed) { + neon_xof_absorb(&state, seed, i, i, 0, 1); + } else { + neon_xof_absorb(&state, seed, 0, 1, i, i); + } + + neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); + + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + + ctr0 = neon_rej_uniform(&(a[i][0][0]), buf0); + ctr1 = neon_rej_uniform(&(a[i][1][0]), buf1); + while (ctr0 < KYBER_N || ctr1 < KYBER_N) { + off = buflen % 3; + for (k = 0; k < off; k++) { + buf0[k] = buf0[buflen - off + k]; + buf1[k] = buf1[buflen - off + k]; + } + neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); + + buflen = off + XOF_BLOCKBYTES; + ctr0 += rej_uniform(&(a[i][0][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); + ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); + } + } + #elif KYBER_K == 3 int16_t *s1 = NULL, *s2 = NULL; unsigned int x1, x2, y1, y2; xof_state c_state; @@ -251,8 +280,38 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S buflen = off + XOF_BLOCKBYTES; ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); } - shake128_ctx_release(&c_state); + #elif KYBER_K == 4 + for (unsigned int i = 0; i < KYBER_K; i++) { + for (unsigned int j = 0; j < KYBER_K; j += 2) { + if (transposed) { + neon_xof_absorb(&state, seed, i, i, j, j + 1); + } else { + neon_xof_absorb(&state, seed, j, j + 1, i, i); + } + + neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); + buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; + ctr0 = neon_rej_uniform(&(a[i][j][0]), buf0); + ctr1 = neon_rej_uniform(&(a[i][j + 1][0]), buf1); + + while (ctr0 < KYBER_N || ctr1 < KYBER_N) { + off = buflen % 3; + for (k = 0; k < off; k++) { + buf0[k] = buf0[buflen - off + k]; + buf1[k] = buf1[buflen - off + k]; + } + neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); + + buflen = off + XOF_BLOCKBYTES; + ctr0 += rej_uniform(&(a[i][j][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); + ctr1 += rej_uniform(&(a[i][j + 1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); + } + } + } + #else +#error "KYBER_K must be in {2,3,4}" + #endif } /************************************************* @@ -267,8 +326,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]) { + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; uint8_t buf[2 * KYBER_SYMBYTES]; const uint8_t *publicseed = buf; @@ -283,9 +342,19 @@ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], gen_a(a, publicseed); + #if KYBER_K == 2 + neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); + neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3); + #elif KYBER_K == 3 neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(e[0][0]), noiseseed, 2, 3); neon_poly_getnoise_eta1_2x(&(e[1][0]), &(e[2][0]), noiseseed, 4, 5); + #elif KYBER_K == 4 + neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); + neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(skpv[3][0]), noiseseed, 2, 3); + neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 4, 5); + neon_poly_getnoise_eta1_2x(&(e[2][0]), &(e[3][0]), noiseseed, 6, 7); + #endif neon_polyvec_ntt(skpv); neon_polyvec_ntt(e); @@ -341,11 +410,32 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], poly_frommsg(k, m); gen_at(at, seed); + #if KYBER_K == 2 + // ETA1 != ETA2 (3 != 2) + neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); + neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3); + neon_poly_getnoise_eta2(&(epp[0]), coins, 4); + #elif KYBER_K == 3 + #if KYBER_ETA1 == KYBER_ETA2 // Because ETA1 == ETA2 neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(ep[0][0]), coins, 2, 3); neon_poly_getnoise_eta1_2x(&(ep[1][0]), &(ep[2][0]), coins, 4, 5); neon_poly_getnoise_eta2(&(epp[0]), coins, 6); + #else +#error "We need eta1 == eta2 here" + #endif + #elif KYBER_K == 4 + #if KYBER_ETA1 == KYBER_ETA2 + neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); + neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3); + neon_poly_getnoise_eta1_2x(&(ep[0][0]), &(ep[1][0]), coins, 4, 5); + neon_poly_getnoise_eta1_2x(&(ep[2][0]), &(ep[3][0]), coins, 6, 7); + neon_poly_getnoise_eta2(&(epp[0]), coins, 8); + #else +#error "We need eta1 == eta2 here" + #endif + #endif neon_polyvec_ntt(sp); diff --git a/crypto_kem/kyber768/aarch64/indcpa.h b/crypto_kem/kyber768/aarch64/indcpa.h index f93487a3..30608327 100644 --- a/crypto_kem/kyber768/aarch64/indcpa.h +++ b/crypto_kem/kyber768/aarch64/indcpa.h @@ -7,16 +7,16 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "polyvec.h" -#include #define gen_matrix KYBER_NAMESPACE(gen_matrix) void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); #define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand) void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]); + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]); #define indcpa_enc KYBER_NAMESPACE(indcpa_enc) void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], diff --git a/crypto_kem/kyber768/aarch64/kem.c b/crypto_kem/kyber768/aarch64/kem.c index 670a4c59..a71d5ac6 100644 --- a/crypto_kem/kyber768/aarch64/kem.c +++ b/crypto_kem/kyber768/aarch64/kem.c @@ -8,12 +8,15 @@ #include #include #include + +#include "api.h" #include "params.h" +#include "kem.h" #include "indcpa.h" #include "verify.h" #include "symmetric.h" #include "randombytes.h" -#include "kem.h" + /************************************************* * Name: crypto_kem_keypair_derand @@ -31,8 +34,8 @@ * Returns 0 (success) **************************************************/ int crypto_kem_keypair_derand(uint8_t *pk, - uint8_t *sk, - const uint8_t *coins) { + uint8_t *sk, + const uint8_t *coins) { indcpa_keypair_derand(pk, sk, coins); memcpy(sk + KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES); hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); @@ -62,6 +65,8 @@ int crypto_kem_keypair(uint8_t *pk, return 0; } + + /************************************************* * Name: crypto_kem_enc_derand * @@ -80,9 +85,9 @@ int crypto_kem_keypair(uint8_t *pk, * Returns 0 (success) **************************************************/ int crypto_kem_enc_derand(uint8_t *ct, - uint8_t *ss, - const uint8_t *pk, - const uint8_t *coins) { + uint8_t *ss, + const uint8_t *pk, + const uint8_t *coins) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber768/aarch64/kem.h b/crypto_kem/kyber768/aarch64/kem.h index 3bedcd28..afb78598 100644 --- a/crypto_kem/kyber768/aarch64/kem.h +++ b/crypto_kem/kyber768/aarch64/kem.h @@ -7,15 +7,16 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" -#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES -#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES -#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES -#define CRYPTO_BYTES KYBER_SSBYTES - +#if (KYBER_K == 2) +#define CRYPTO_ALGNAME "Kyber512" +#elif (KYBER_K == 3) #define CRYPTO_ALGNAME "Kyber768" +#elif (KYBER_K == 4) +#define CRYPTO_ALGNAME "Kyber1024" +#endif #define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand) int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins); @@ -33,3 +34,4 @@ int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif + diff --git a/crypto_kem/kyber768/aarch64/macros.inc b/crypto_kem/kyber768/aarch64/macros.inc index 2add309e..5504405c 100644 --- a/crypto_kem/kyber768/aarch64/macros.inc +++ b/crypto_kem/kyber768/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,11 +28,114 @@ * SOFTWARE. */ -#ifndef MACROS_S -#define MACROS_S - #include "macros_common.inc" +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + .macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H .endm @@ -52,16 +158,68 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + .macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm -.macro qo_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm -.macro qo_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H .endm @@ -69,18 +227,176 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + .macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm -.macro do_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H .endm -.macro do_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm -#endif +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== + +.macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 +.endm + +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm +// === 32-bit end ==== diff --git a/crypto_kem/kyber768/aarch64/macros_common.inc b/crypto_kem/kyber768/aarch64/macros_common.inc index c1ac021c..07568491 100644 --- a/crypto_kem/kyber768/aarch64/macros_common.inc +++ b/crypto_kem/kyber768/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,31 +35,51 @@ .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -75,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -85,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -102,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -138,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -152,34 +310,340 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] mul \t4\wX, \b4\wX, \z4\nX[\h4] sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] mul \t5\wX, \b5\wX, \z5\nX[\h5] sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] mul \t6\wX, \b6\wX, \z6\nX[\h6] sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] mul \t7\wX, \b7\wX, \z7\nX[\h7] + ldr \d0, [\srcd_ptr, \memd0] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] add \a2\wX, \a2\wX, \t2\wX sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] add \a3\wX, \a3\wX, \t3\wX sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -221,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -231,7 +769,53 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D + +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX @@ -248,15 +832,77 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + ldr \c0, [\srcc_ptr, \memc0] mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] mul \t1\wX, \b1\wX, \h1\wX sub \b3\wX, \a3\wX, \t3\wX + ldr \c2, [\srcc_ptr, \memc2] sqrdmulh \b0\wX, \b0\wX, \l0\wX add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] sqrdmulh \b1\wX, \b1\wX, \l1\wX add \a3\wX, \a3\wX, \t3\wX @@ -269,53 +915,53 @@ .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -394,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX diff --git a/crypto_kem/kyber768/aarch64/neon_poly.c b/crypto_kem/kyber768/aarch64/neon_poly.c index 03e7a329..bd824144 100644 --- a/crypto_kem/kyber768/aarch64/neon_poly.c +++ b/crypto_kem/kyber768/aarch64/neon_poly.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -129,14 +130,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER768_AARCH64_asm_add_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER768_AARCH64__asm_add_reduce(int16_t *, const int16_t *); void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER768_AARCH64_asm_add_reduce(c, a); + PQCLEAN_KYBER768_AARCH64__asm_add_reduce(c, a); } -extern void PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); +extern void PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) { - PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce(c, a, b); + PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce(c, a, b); } /************************************************* @@ -150,7 +151,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER768_AARCH64_asm_sub_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER768_AARCH64__asm_sub_reduce(int16_t *, const int16_t *); void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER768_AARCH64_asm_sub_reduce(c, a); + PQCLEAN_KYBER768_AARCH64__asm_sub_reduce(c, a); } diff --git a/crypto_kem/kyber768/aarch64/neon_polyvec.c b/crypto_kem/kyber768/aarch64/neon_polyvec.c index c05f59d6..8787fcde 100644 --- a/crypto_kem/kyber768/aarch64/neon_polyvec.c +++ b/crypto_kem/kyber768/aarch64/neon_polyvec.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -83,7 +84,7 @@ void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]) { * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]) { +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { // c = c + a; @@ -91,4 +92,3 @@ void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYB neon_poly_add_reduce(c[i], a[i]); } } - diff --git a/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c b/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c index a5a2e783..aa096294 100644 --- a/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c +++ b/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -88,8 +89,8 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce1, uint8_t nonce2) { unsigned int i; - uint8_t extkey1[KYBER_SYMBYTES + 1 + 15]; - uint8_t extkey2[KYBER_SYMBYTES + 1 + 15]; + uint8_t extkey1[KYBER_SYMBYTES + 1]; + uint8_t extkey2[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey1[i] = key[i]; @@ -99,5 +100,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, extkey1[i] = nonce1; extkey2[i] = nonce2; - shake256x2(out1, out2, outlen, extkey1, extkey2, KYBER_SYMBYTES + 1); + shake256x2(out1, out2, outlen, extkey1, extkey2, sizeof(extkey1)); } diff --git a/crypto_kem/kyber768/aarch64/ntt.c b/crypto_kem/kyber768/aarch64/ntt.c index 8bca765e..69cb756f 100644 --- a/crypto_kem/kyber768/aarch64/ntt.c +++ b/crypto_kem/kyber768/aarch64/ntt.c @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,12 +28,35 @@ * SOFTWARE. */ -#include +#include #include "params.h" #include "ntt.h" -#include "reduce.h" #include "NTT_params.h" +const __attribute__ ((aligned (16)))int16_t asymmetric_const[8] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t constants[16] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, + invNQ1_R3modQ1_prime_half, + invNQ1_R3modQ1_doubleprime, + invNQ1_final_R3modQ1_prime_half, + invNQ1_final_R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { +0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 +}; + +const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { +167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { +0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 +}; + /************************************************* * Name: ntt * diff --git a/crypto_kem/kyber768/aarch64/ntt.h b/crypto_kem/kyber768/aarch64/ntt.h index 90e1c61d..3ed9cdcf 100644 --- a/crypto_kem/kyber768/aarch64/ntt.h +++ b/crypto_kem/kyber768/aarch64/ntt.h @@ -2,11 +2,14 @@ #define NTT_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -34,11 +37,6 @@ extern const int16_t zetas[128]; -#define ntt KYBER_NAMESPACE(ntt) -void ntt(int16_t r[256]); -#define invntt KYBER_NAMESPACE(invntt) -void invntt(int16_t r[256]); - extern void PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); @@ -49,38 +47,35 @@ extern void PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended(int16_t *, const in extern void PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); extern void PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); -static const int16_t asymmetric_const[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime -}; - -#define NTT(in) { \ - PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } - -#define iNTT(in) { \ - PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } - -static const int16_t constants[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, - invNQ1_R3modQ1_prime_half, - invNQ1_R3modQ1_doubleprime, - invNQ1_final_R3modQ1_prime_half, - invNQ1_final_R3modQ1_doubleprime -}; - -static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, -7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0 -}; - -static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { - 167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175 -}; - -static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0 -}; +extern +const int16_t asymmetric_const[8]; +extern +const int16_t constants[16]; + +extern +const int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; + +extern +const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; + +extern +const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; + + +#define NTT(in) do { \ + PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define ntt KYBER_NAMESPACE(ntt) +void ntt(int16_t r[256]); +#define invntt KYBER_NAMESPACE(invntt) +void invntt(int16_t r[256]); + #endif diff --git a/crypto_kem/kyber768/aarch64/params.h b/crypto_kem/kyber768/aarch64/params.h index 66151ac2..33b314e4 100644 --- a/crypto_kem/kyber768/aarch64/params.h +++ b/crypto_kem/kyber768/aarch64/params.h @@ -9,6 +9,7 @@ #define KYBER_NAMESPACE(s) PQCLEAN_KYBER768_AARCH64_##s +/* Don't change parameters below this line */ #define KYBER_N 256 #define KYBER_Q 3329 diff --git a/crypto_kem/kyber768/aarch64/poly.c b/crypto_kem/kyber768/aarch64/poly.c index 7d5dbe66..9e7abbd0 100644 --- a/crypto_kem/kyber768/aarch64/poly.c +++ b/crypto_kem/kyber768/aarch64/poly.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/kyber/blob/master/ref * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -53,6 +54,7 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N int16_t u; uint8_t t[8]; + #if (KYBER_POLYCOMPRESSEDBYTES == 128) for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { // map to positive standard representatives @@ -67,6 +69,25 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N r[3] = t[6] | (t[7] << 4); r += 4; } + #elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + // map to positive standard representatives + u = a[8 * i + j]; + u += (u >> 15) & KYBER_Q; + t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31; + } + + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; + } + #else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" + #endif } /************************************************* @@ -82,11 +103,33 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N void poly_decompress(int16_t r[KYBER_N], const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { unsigned int i; + #if (KYBER_POLYCOMPRESSEDBYTES == 128) for (i = 0; i < KYBER_N / 2; i++) { r[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; r[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; a += 1; } + #elif (KYBER_POLYCOMPRESSEDBYTES == 160) + unsigned int j; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 5) | (a[1] << 3); + t[2] = (a[1] >> 2); + t[3] = (a[1] >> 7) | (a[2] << 1); + t[4] = (a[2] >> 4) | (a[3] << 4); + t[5] = (a[3] >> 1); + t[6] = (a[3] >> 6) | (a[4] << 2); + t[7] = (a[4] >> 3); + a += 5; + + for (j = 0; j < 8; j++) { + r[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; + } + } + #else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" + #endif } /************************************************* @@ -172,6 +215,10 @@ void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]) unsigned int i, j; int16_t mask; + #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) +#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" + #endif + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { mask = -(int16_t)((msg[i] >> j) & 1); diff --git a/crypto_kem/kyber768/aarch64/poly.h b/crypto_kem/kyber768/aarch64/poly.h index 83c35067..ae6bf04d 100644 --- a/crypto_kem/kyber768/aarch64/poly.h +++ b/crypto_kem/kyber768/aarch64/poly.h @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial @@ -30,7 +30,7 @@ void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const int16_t a[KYBER_N]); #define poly_frommsg KYBER_NAMESPACE(poly_frommsg) void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]); #define poly_tomsg KYBER_NAMESPACE(poly_tomsg) -void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t a[KYBER_N]); +void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t r[KYBER_N]); // NEON diff --git a/crypto_kem/kyber768/aarch64/polyvec.c b/crypto_kem/kyber768/aarch64/polyvec.c index d495809e..8907c316 100644 --- a/crypto_kem/kyber768/aarch64/polyvec.c +++ b/crypto_kem/kyber768/aarch64/polyvec.c @@ -19,9 +19,34 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; + #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + for (k = 0; k < 8; k++) { + t[k] = a[i][8 * j + k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; + } + + r[ 0] = (t[0] >> 0); + r[ 1] = (t[0] >> 8) | (t[1] << 3); + r[ 2] = (t[1] >> 5) | (t[2] << 6); + r[ 3] = (t[2] >> 2); + r[ 4] = (t[2] >> 10) | (t[3] << 1); + r[ 5] = (t[3] >> 7) | (t[4] << 4); + r[ 6] = (t[4] >> 4) | (t[5] << 7); + r[ 7] = (t[5] >> 1); + r[ 8] = (t[5] >> 9) | (t[6] << 2); + r[ 9] = (t[6] >> 6) | (t[7] << 5); + r[10] = (t[7] >> 3); + r += 11; + } + } + #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { @@ -39,6 +64,9 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K r += 5; } } + #else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" + #endif } /************************************************* @@ -54,6 +82,26 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; + #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); + t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); + t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); + t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); + t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); + t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); + t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); + t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); + a += 11; + + for (k = 0; k < 8; k++) { + r[i][8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; + } + } + } + #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { @@ -68,6 +116,9 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV } } } + #else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" + #endif } /************************************************* @@ -79,7 +130,7 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV * (needs space for KYBER_POLYVECBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { poly_tobytes(r + i * KYBER_POLYBYTES, a[i]); diff --git a/crypto_kem/kyber768/aarch64/polyvec.h b/crypto_kem/kyber768/aarch64/polyvec.h index 827610d6..69e7db9c 100644 --- a/crypto_kem/kyber768/aarch64/polyvec.h +++ b/crypto_kem/kyber768/aarch64/polyvec.h @@ -7,8 +7,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -34,21 +35,21 @@ * SOFTWARE. */ +#include #include "params.h" #include "poly.h" -#include typedef struct { poly vec[KYBER_K]; } polyvec; #define polyvec_compress KYBER_NAMESPACE(polyvec_compress) -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress) void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); #define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes) -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes) void polyvec_frombytes(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECBYTES]); @@ -61,6 +62,6 @@ void neon_polyvec_ntt(int16_t r[KYBER_K][KYBER_N]); void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]); #define neon_polyvec_add_reduce KYBER_NAMESPACE(polyvec_add_reduce) -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]); +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]); #endif diff --git a/crypto_kem/kyber768/aarch64/reduce.h b/crypto_kem/kyber768/aarch64/reduce.h index 7d0f8e3b..4a7c3426 100644 --- a/crypto_kem/kyber768/aarch64/reduce.h +++ b/crypto_kem/kyber768/aarch64/reduce.h @@ -7,11 +7,11 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" -#define MONT (-1044) // 2^16 mod q -#define QINV (-3327) // q^-1 mod 2^16 +#define MONT -1044 // 2^16 mod q +#define QINV -3327 // q^-1 mod 2^16 #define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce) int16_t montgomery_reduce(int32_t a); diff --git a/crypto_kem/kyber768/aarch64/rejsample.h b/crypto_kem/kyber768/aarch64/rejsample.h index ee9ae85c..7a9fb471 100644 --- a/crypto_kem/kyber768/aarch64/rejsample.h +++ b/crypto_kem/kyber768/aarch64/rejsample.h @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" #define neon_rej_uniform KYBER_NAMESPACE(_neon_rej_uniform) unsigned int neon_rej_uniform(int16_t *r, diff --git a/crypto_kem/kyber768/aarch64/symmetric-shake.c b/crypto_kem/kyber768/aarch64/symmetric-shake.c index 067922ec..14a4c28c 100644 --- a/crypto_kem/kyber768/aarch64/symmetric-shake.c +++ b/crypto_kem/kyber768/aarch64/symmetric-shake.c @@ -55,6 +55,8 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM shake256(out, outlen, extkey, sizeof(extkey)); } + + /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf * diff --git a/crypto_kem/kyber768/aarch64/symmetric.h b/crypto_kem/kyber768/aarch64/symmetric.h index cb9ea69e..2a59b8b8 100644 --- a/crypto_kem/kyber768/aarch64/symmetric.h +++ b/crypto_kem/kyber768/aarch64/symmetric.h @@ -8,9 +8,9 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include #include +#include "params.h" #include "fips202.h" @@ -27,6 +27,7 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM #define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf) void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]); + #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) @@ -36,6 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) + // NEON Definition #include "fips202x2.h" @@ -65,3 +67,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, shake128x2_squeezeblocks(OUT0, OUT1, OUTBLOCKS, STATE) #endif /* SYMMETRIC_H */ + + diff --git a/crypto_kem/kyber768/aarch64/verify.h b/crypto_kem/kyber768/aarch64/verify.h index 3b9eca9f..ac78bc35 100644 --- a/crypto_kem/kyber768/aarch64/verify.h +++ b/crypto_kem/kyber768/aarch64/verify.h @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include #include +#include "params.h" #define verify KYBER_NAMESPACE(verify) int verify(const uint8_t *a, const uint8_t *b, size_t len); From c783e702afff55a439a3f65cf66b5dbf2d46cd25 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 14:02:52 +0100 Subject: [PATCH 61/85] update dilithium aarch64 --- crypto_sign/dilithium2/aarch64/LICENSE | 125 +-- crypto_sign/dilithium2/aarch64/NTT_params.h | 14 +- crypto_sign/dilithium2/aarch64/__asm_NTT.S | 571 +++++++----- crypto_sign/dilithium2/aarch64/__asm_iNTT.S | 485 +++++----- crypto_sign/dilithium2/aarch64/__asm_poly.S | 327 +++---- crypto_sign/dilithium2/aarch64/api.h | 8 +- crypto_sign/dilithium2/aarch64/feat.S | 6 +- crypto_sign/dilithium2/aarch64/fips202x2.c | 70 +- crypto_sign/dilithium2/aarch64/fips202x2.h | 10 +- crypto_sign/dilithium2/aarch64/macros.inc | 339 ++++++- .../dilithium2/aarch64/macros_common.inc | 852 ++++++++++++++++-- crypto_sign/dilithium2/aarch64/ntt.c | 23 +- crypto_sign/dilithium2/aarch64/ntt.h | 58 +- crypto_sign/dilithium2/aarch64/packing.c | 12 +- crypto_sign/dilithium2/aarch64/packing.h | 13 +- crypto_sign/dilithium2/aarch64/params.h | 61 +- crypto_sign/dilithium2/aarch64/poly.c | 150 ++- crypto_sign/dilithium2/aarch64/polyvec.c | 19 +- crypto_sign/dilithium2/aarch64/polyvec.h | 3 + crypto_sign/dilithium2/aarch64/rounding.c | 27 +- crypto_sign/dilithium2/aarch64/sign.c | 28 +- crypto_sign/dilithium2/aarch64/sign.h | 3 +- .../dilithium2/aarch64/symmetric-shake.c | 5 +- crypto_sign/dilithium2/aarch64/symmetric.h | 6 +- crypto_sign/dilithium3/aarch64/LICENSE | 125 +-- crypto_sign/dilithium3/aarch64/NTT_params.h | 14 +- crypto_sign/dilithium3/aarch64/__asm_NTT.S | 571 +++++++----- crypto_sign/dilithium3/aarch64/__asm_iNTT.S | 485 +++++----- crypto_sign/dilithium3/aarch64/__asm_poly.S | 327 +++---- crypto_sign/dilithium3/aarch64/api.h | 8 +- crypto_sign/dilithium3/aarch64/feat.S | 6 +- crypto_sign/dilithium3/aarch64/fips202x2.c | 70 +- crypto_sign/dilithium3/aarch64/fips202x2.h | 10 +- crypto_sign/dilithium3/aarch64/macros.inc | 339 ++++++- .../dilithium3/aarch64/macros_common.inc | 852 ++++++++++++++++-- crypto_sign/dilithium3/aarch64/ntt.c | 23 +- crypto_sign/dilithium3/aarch64/ntt.h | 58 +- crypto_sign/dilithium3/aarch64/packing.c | 12 +- crypto_sign/dilithium3/aarch64/packing.h | 13 +- crypto_sign/dilithium3/aarch64/params.h | 61 +- crypto_sign/dilithium3/aarch64/poly.c | 199 +++- crypto_sign/dilithium3/aarch64/polyvec.c | 19 +- crypto_sign/dilithium3/aarch64/polyvec.h | 3 + crypto_sign/dilithium3/aarch64/rounding.c | 27 +- crypto_sign/dilithium3/aarch64/sign.c | 28 +- crypto_sign/dilithium3/aarch64/sign.h | 3 +- .../dilithium3/aarch64/symmetric-shake.c | 5 +- crypto_sign/dilithium3/aarch64/symmetric.h | 6 +- crypto_sign/dilithium5/aarch64/LICENSE | 125 +-- crypto_sign/dilithium5/aarch64/NTT_params.h | 14 +- crypto_sign/dilithium5/aarch64/__asm_NTT.S | 571 +++++++----- crypto_sign/dilithium5/aarch64/__asm_iNTT.S | 485 +++++----- crypto_sign/dilithium5/aarch64/__asm_poly.S | 327 +++---- crypto_sign/dilithium5/aarch64/api.h | 8 +- crypto_sign/dilithium5/aarch64/feat.S | 6 +- crypto_sign/dilithium5/aarch64/fips202x2.c | 70 +- crypto_sign/dilithium5/aarch64/fips202x2.h | 10 +- crypto_sign/dilithium5/aarch64/macros.inc | 339 ++++++- .../dilithium5/aarch64/macros_common.inc | 852 ++++++++++++++++-- crypto_sign/dilithium5/aarch64/ntt.c | 23 +- crypto_sign/dilithium5/aarch64/ntt.h | 58 +- crypto_sign/dilithium5/aarch64/packing.c | 12 +- crypto_sign/dilithium5/aarch64/packing.h | 13 +- crypto_sign/dilithium5/aarch64/params.h | 61 +- crypto_sign/dilithium5/aarch64/poly.c | 175 +++- crypto_sign/dilithium5/aarch64/polyvec.c | 19 +- crypto_sign/dilithium5/aarch64/polyvec.h | 3 + crypto_sign/dilithium5/aarch64/rounding.c | 27 +- crypto_sign/dilithium5/aarch64/sign.c | 28 +- crypto_sign/dilithium5/aarch64/sign.h | 3 +- .../dilithium5/aarch64/symmetric-shake.c | 5 +- crypto_sign/dilithium5/aarch64/symmetric.h | 6 +- 72 files changed, 6989 insertions(+), 2730 deletions(-) diff --git a/crypto_sign/dilithium2/aarch64/LICENSE b/crypto_sign/dilithium2/aarch64/LICENSE index 0e259d42..093b0a7d 100644 --- a/crypto_sign/dilithium2/aarch64/LICENSE +++ b/crypto_sign/dilithium2/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/crypto_sign/dilithium2/aarch64/NTT_params.h b/crypto_sign/dilithium2/aarch64/NTT_params.h index 582c16ed..dc261a2d 100644 --- a/crypto_sign/dilithium2/aarch64/NTT_params.h +++ b/crypto_sign/dilithium2/aarch64/NTT_params.h @@ -2,7 +2,9 @@ #define NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -42,14 +44,14 @@ #define invomegaQ1 731434 // R = 2^32 below // RmodQ1 = 2^32 mod^{+-} Q1 -#define RmodQ1 (-4186625) +#define RmodQ1 -4186625 // Q1prime = Q1^{-1} mod^{+-} 2^32 #define Q1prime 58728449 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 8347681 // invNQ1R2modQ1 = -NTT_N^{-1} 2^32 2^32 mod^{+-} Q1 below -#define invNQ1R2modQ1 (-41978) +#define invNQ1R2modQ1 -41978 // invNQ1R2modQ1_prime = invNQ1R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 #define invNQ1R2modQ1_prime 8395782 // invNQ1R2modQ1_prime_half = (invNQ1R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 @@ -60,11 +62,11 @@ // invNQ1_final_R2modQ1 = -invNQ1R2modQ1 invomegaQ1^{128} mod q #define invNQ1_final_R2modQ1 4404704 // invNQ1_final_R2modQ1_prime = invNQ1_final_R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime (-151046688) +#define invNQ1_final_R2modQ1_prime -151046688 // invNQ1_final_R2modQ1_prime_half = (invNQ1_final_R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime_half (-75523344) +#define invNQ1_final_R2modQ1_prime_half -75523344 // invNQ1_final_R2modQ1_doubleprime = (invNQ1_final_R2modQ1_prime Q1 - invNQ1_final_R2modQ1) / 2^32 -#define invNQ1_final_R2modQ1_doubleprime (-294725) +#define invNQ1_final_R2modQ1_doubleprime -294725 // RmodQ1_prime = -(RmodQ1 + Q1) Q1prime mod^{+-} 2^32 #define RmodQ1_prime 512 diff --git a/crypto_sign/dilithium2/aarch64/__asm_NTT.S b/crypto_sign/dilithium2/aarch64/__asm_NTT.S index bf7c70cf..ad121e6f 100644 --- a/crypto_sign/dilithium2/aarch64/__asm_NTT.S +++ b/crypto_sign/dilithium2/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,261 +30,413 @@ #include "macros.inc" -.align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top -PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top: -_PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top: - - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 - - ldr Q, [x2] - - mov table, x1 - - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 - - add src11, src0, #704 - add src12, src0, #768 +#include "params.h" - add src13, src0, #832 - add src14, src0, #896 +.align 2 +.global PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top +PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top: +_PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top: - add src15, src0, #960 + push_simd + Q .req w8 + src .req x0 + counter .req x11 - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 + ldr Q, [x2] mov v20.S[0], Q - ld1 { v1.4S}, [ src1] - ld1 { v3.4S}, [ src3] - ld1 { v5.4S}, [ src5] - ld1 { v7.4S}, [ src7] - ld1 { v9.4S}, [ src9] - ld1 {v11.4S}, [src11] - ld1 {v13.4S}, [src13] - ld1 {v15.4S}, [src15] - - ld1 { v0.4S}, [ src0] - ld1 { v2.4S}, [ src2] - ld1 { v4.4S}, [ src4] - ld1 { v6.4S}, [ src6] - ld1 { v8.4S}, [ src8] - ld1 {v10.4S}, [src10] - ld1 {v12.4S}, [src12] - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + ldr q9, [src, #9*64] + ldr q11, [src, #11*64] + ldr q13, [src, #13*64] + ldr q15, [src, #15*64] + + qq_butterfly_topl \ + v9, v11, v13, v15, v16, v17, v18, v19, v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64 + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) mov counter, #3 _ntt_top_loop: - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mixssl \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + v9, v11, v13, v15, v16, v17, v18, v19, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64, \ + src, \ + q1, q3, q5, q7, \ + #(16+1*64), #(16+3*64), #(16+5*64), #(16+7*64) + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #(16+8*64), #(16+10*64), #(16+12*64), #(16+14*64), \ + src, \ + q0, q2, q4, q6, \ + #(16+0*64), #(16+2*64), #(16+4*64), #(16+6*64) + + add src, src, #16 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) sub counter, counter, #1 cbnz counter, _ntt_top_loop - st1 { v1.4S}, [ src1], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v7.4S}, [ src7], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v11.4S}, [src11], #16 - st1 {v13.4S}, [src13], #16 - st1 {v15.4S}, [src15], #16 - - st1 { v0.4S}, [ src0], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v8.4S}, [ src8], #16 - st1 {v10.4S}, [src10], #16 - st1 {v12.4S}, [src12], #16 - st1 {v14.4S}, [src14], #16 + qq_butterfly_botss \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter - pop_all + pop_simd - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot -PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot: -_PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot: - - push_all - Q .req w20 - src0 .req x0 - des0 .req x1 - src1 .req x2 - des1 .req x3 - table0 .req x28 - table1 .req x27 - counter .req x19 +.global PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot +PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot: +_PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot: + + push_simd + Q .req w8 + src .req x0 + table0 .req x9 + table1 .req x10 + counter .req x11 ldr Q, [x2] add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + ldr q0, [src, #0*16] + ldr q1, [src, #1*16] + ldr q2, [src, #2*16] + ldr q3, [src, #3*16] + + ldr q4, [table0, #0*16] + ldr q5, [table0, #1*16] + ldr q20, [table1, #0*16] + ldr q21, [table1, #1*16] + + dq_butterfly_topl4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 + + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + - add des0, src0, #0 - add des1, src0, #512 + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - mov counter, #8 + str q0, [src, #0*16] + str q2, [src, #2*16] + + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 + + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + mov counter, #3 _ntt_bot_loop: - ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld1 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + trn_4x4_l4 v12, v13, v14, v15, v8, v9, v10, v11, src, q0, q1, q2, q3, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + str q12, [src, #0*16] + str q13, [src, #1*16] - mov v4.S[0], Q + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - dq_butterfly_top v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 - dq_butterfly_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + str q14, [src, #2*16] + str q15, [src, #3*16] + + + add src, src, #64 + + trn_4x4_l4 v28, v29, v30, v31, v24, v25, v26, v27, src, q16, q17, q18, q19, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q28, q29, q30, q31, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + add table0, table0, #128 + add table1, table1, #128 + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 - dq_butterfly_vec_top v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7 - dq_butterfly_vec_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 - dq_butterfly_vec_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) + + str q0, [src, #0*16] + str q2, [src, #2*16] + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 - st4 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st4 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - .unreq Q - .unreq src0 - .unreq des0 - .unreq src1 - .unreq des1 - .unreq table0 - .unreq table1 - .unreq counter - pop_all + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - br lr + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + trn_4x4_s4 v28, v29, v30, v31, v16, v17, v18, v19, src, q12, q13, q14, q15, #0*16, #1*16, #2*16, #3*16 + str q28, [src, #(512+0*16)] + str q29, [src, #(512+1*16)] + str q30, [src, #(512+2*16)] + str q31, [src, #(512+3*16)] + add src, src, #64 + .unreq Q + .unreq src + .unreq table0 + .unreq table1 + .unreq counter + pop_simd + ret diff --git a/crypto_sign/dilithium2/aarch64/__asm_iNTT.S b/crypto_sign/dilithium2/aarch64/__asm_iNTT.S index cb20745c..b37abaeb 100644 --- a/crypto_sign/dilithium2/aarch64/__asm_iNTT.S +++ b/crypto_sign/dilithium2/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top -PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: -_PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top +PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top: +_PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top: push_all Q .req w20 @@ -41,23 +44,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: invNR2dp .req w25 invNWR2ph .req w26 invNWR2dp .req w27 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 counter .req x19 ldr Q, [x2, #0] @@ -69,77 +56,63 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: ldr invNWR2ph, [x2, #24] ldr invNWR2dp, [x2, #28] - mov table, x1 + ldr q20, [x1, #0*16] + ldr q21, [x1, #1*16] + ldr q22, [x1, #2*16] + ldr q23, [x1, #3*16] + ldr q24, [x1, #4*16] + ldr q25, [x1, #5*16] + ldr q26, [x1, #6*16] + ldr q27, [x1, #7*16] - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 + mov v20.S[0], Q - add src11, src0, #704 - add src12, src0, #768 + ldr q0, [src, # 0*64] + ldr q1, [src, # 1*64] - add src13, src0, #832 - add src14, src0, #896 + ldr q2, [src, # 2*64] + ldr q3, [src, # 3*64] - add src15, src0, #960 + ldr q4, [src, # 4*64] + ldr q5, [src, # 5*64] - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ldr q6, [src, # 6*64] + ldr q7, [src, # 7*64] - mov v20.S[0], Q + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 - ld1 { v0.4S}, [ src0] - ld1 { v1.4S}, [ src1] - ld1 { v2.4S}, [ src2] - ld1 { v3.4S}, [ src3] - ld1 { v4.4S}, [ src4] - ld1 { v5.4S}, [ src5] - ld1 { v6.4S}, [ src6] - ld1 { v7.4S}, [ src7] - - ld1 { v8.4S}, [ src8] - ld1 { v9.4S}, [ src9] - ld1 {v10.4S}, [src10] - ld1 {v11.4S}, [src11] - ld1 {v12.4S}, [src12] - ld1 {v13.4S}, [src13] - ld1 {v14.4S}, [src14] - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + mov counter, #3 + _intt_top_loop: + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -153,77 +126,99 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: sub v17.4S, v17.4S, v19.4S mla v0.4S, v16.4S, v29.4S - mla v1.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v2.4S + mla v1.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v3.4S + + str q0, [src, #0*64] cmge v16.4S, v2.4S, v30.4S + ldr q0, [src, #(16 + 0*64)] + str q1, [src, #1*64] cmge v17.4S, v3.4S, v30.4S + ldr q1, [src, #(16 + 1*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v2.4S, v16.4S, v29.4S - mla v3.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v4.4S + mla v3.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v5.4S + + str q2, [src, #2*64] cmge v16.4S, v4.4S, v30.4S + ldr q2, [src, #(16 + 2*64)] + str q3, [src, #3*64] cmge v17.4S, v5.4S, v30.4S + ldr q3, [src, #(16 + 3*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v4.4S, v16.4S, v29.4S - mla v5.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v6.4S + mla v5.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v7.4S + + str q4, [src, #4*64] cmge v16.4S, v6.4S, v30.4S + ldr q4, [src, #(16 + 4*64)] + str q5, [src, #5*64] cmge v17.4S, v7.4S, v30.4S + ldr q5, [src, #(16 + 5*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v6.4S, v16.4S, v29.4S - mla v7.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v8.4S + mla v7.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v9.4S + + str q6, [src, #6*64] cmge v16.4S, v8.4S, v30.4S + ldr q6, [src, #(16 + 6*64)] + str q7, [src, #7*64] cmge v17.4S, v9.4S, v30.4S + ldr q7, [src, #(16 + 7*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v8.4S, v16.4S, v29.4S - mla v9.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v10.4S + mla v9.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v11.4S + + str q8, [src, #8*64] cmge v16.4S, v10.4S, v30.4S + str q9, [src, #9*64] cmge v17.4S, v11.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v10.4S, v16.4S, v29.4S - mla v11.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v12.4S + mla v11.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v13.4S + + str q10, [src, #10*64] cmge v16.4S, v12.4S, v30.4S + str q11, [src, #11*64] cmge v17.4S, v13.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v12.4S, v16.4S, v29.4S - mla v13.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v14.4S + mla v13.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v15.4S + + str q12, [src, #12*64] cmge v16.4S, v14.4S, v30.4S + str q13, [src, #13*64] cmge v17.4S, v15.4S, v30.4S sub v16.4S, v16.4S, v18.4S @@ -232,66 +227,45 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - mov counter, #3 - _intt_top_loop: - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 + + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 + + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -307,6 +281,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v0.4S, v16.4S, v29.4S mla v1.4S, v17.4S, v29.4S + str q0, [src, #0*64] + str q1, [src, #1*64] + cmge v18.4S, v31.4S, v2.4S cmge v19.4S, v31.4S, v3.4S cmge v16.4S, v2.4S, v30.4S @@ -318,6 +295,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v2.4S, v16.4S, v29.4S mla v3.4S, v17.4S, v29.4S + str q2, [src, #2*64] + str q3, [src, #3*64] + cmge v18.4S, v31.4S, v4.4S cmge v19.4S, v31.4S, v5.4S cmge v16.4S, v4.4S, v30.4S @@ -329,6 +309,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v4.4S, v16.4S, v29.4S mla v5.4S, v17.4S, v29.4S + str q4, [src, #4*64] + str q5, [src, #5*64] + cmge v18.4S, v31.4S, v6.4S cmge v19.4S, v31.4S, v7.4S cmge v16.4S, v6.4S, v30.4S @@ -340,6 +323,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v6.4S, v16.4S, v29.4S mla v7.4S, v17.4S, v29.4S + str q6, [src, #6*64] + str q7, [src, #7*64] + cmge v18.4S, v31.4S, v8.4S cmge v19.4S, v31.4S, v9.4S cmge v16.4S, v8.4S, v30.4S @@ -351,6 +337,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v8.4S, v16.4S, v29.4S mla v9.4S, v17.4S, v29.4S + str q8, [src, #8*64] + str q9, [src, #9*64] + cmge v18.4S, v31.4S, v10.4S cmge v19.4S, v31.4S, v11.4S cmge v16.4S, v10.4S, v30.4S @@ -362,6 +351,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v10.4S, v16.4S, v29.4S mla v11.4S, v17.4S, v29.4S + str q10, [src, #10*64] + str q11, [src, #11*64] + cmge v18.4S, v31.4S, v12.4S cmge v19.4S, v31.4S, v13.4S cmge v16.4S, v12.4S, v30.4S @@ -373,6 +365,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v12.4S, v16.4S, v29.4S mla v13.4S, v17.4S, v29.4S + str q12, [src, #12*64] + str q13, [src, #13*64] + cmge v18.4S, v31.4S, v14.4S cmge v19.4S, v31.4S, v15.4S cmge v16.4S, v14.4S, v30.4S @@ -384,26 +379,11 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - sub counter, counter, #1 - cbnz counter, _intt_top_loop + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 - st1 { v0.4S}, [ src0], #16 - st1 { v1.4S}, [ src1], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v7.4S}, [ src7], #16 - - st1 { v8.4S}, [ src8], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v10.4S}, [src10], #16 - st1 {v11.4S}, [src11], #16 - st1 {v12.4S}, [src12], #16 - st1 {v13.4S}, [src13], #16 - st1 {v14.4S}, [src14], #16 - st1 {v15.4S}, [src15], #16 .unreq Q .unreq Qhalf @@ -412,41 +392,23 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: .unreq invNR2dp .unreq invNWR2ph .unreq invNWR2dp - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot -PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot: -_PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot +PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot: +_PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot: push_all Q .req w20 RphRdp .req x21 src0 .req x0 - des0 .req x1 src1 .req x2 - des1 .req x3 table0 .req x28 table1 .req x27 counter .req x19 @@ -457,72 +419,175 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot: add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + add src1, src0, #512 - add des0, src0, #0 - add des1, src0, #512 + ldr q8, [table0, #4*16] + ldr q9, [table0, #5*16] + ldr q10, [table0, #6*16] + ldr q11, [table0, #7*16] - mov counter, #8 - _intt_bot_loop: + ldr q24, [table1, #4*16] + ldr q25, [table1, #5*16] + ldr q26, [table1, #6*16] + ldr q27, [table1, #7*16] + + ldr q0, [src0, # 0*16] + ldr q1, [src0, # 1*16] + + ldr q16, [src1, # 0*16] + ldr q17, [src1, # 1*16] - ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld4 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + ldr q2, [src0, # 2*16] + ldr q3, [src0, # 3*16] - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + ldr q18, [src1, # 2*16] + ldr q19, [src1, # 3*16] + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 mov v4.S[0], Q mov v20.D[0], RphRdp dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 - dq_butterfly_vec_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 - dq_butterfly_vec_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 - dq_butterfly_vec_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_top v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + mov counter, #7 + _intt_bot_loop: + + dq_butterfly_vec_top_ltrn_4x4 \ + v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + table0, \ + q8, q9, q10, q11, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16), \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q24, q25, q26, q27, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16) dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 - dq_butterfly_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + str q2, [src0, # 2*16] srshr v14.4S, v0.4S, #23 + ldr q2, [src0, #(64+ 2*16)] + str q3, [src0, # 3*16] srshr v15.4S, v1.4S, #23 + ldr q3, [src0, #(64+ 3*16)] + str q18, [src1, # 2*16] srshr v30.4S, v16.4S, #23 + ldr q18, [src1, #(64+ 2*16)] + str q19, [src1, # 3*16] srshr v31.4S, v17.4S, #23 + ldr q19, [src1, #(64+ 3*16)] mls v0.4S, v14.4S, v4.S[0] + str q0, [src0, # 0*16] + ldr q0, [src0, #(64+ 0*16)] mls v1.4S, v15.4S, v4.S[0] + str q1, [src0, # 1*16] + ldr q1, [src0, #(64+ 1*16)] mls v16.4S, v30.4S, v4.S[0] + str q16, [src1, # 0*16] + ldr q16, [src1, #(64+ 0*16)] mls v17.4S, v31.4S, v4.S[0] + str q17, [src1, # 1*16] + ldr q17, [src1, #(64+ 1*16)] + + add table0, table0, #128 + add table1, table1, #128 - st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st1 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 + + mov v4.S[0], Q + mov v20.D[0], RphRdp + + dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 sub counter, counter, #1 cbnz counter, _intt_bot_loop + dq_butterfly_vec_top_trn_4x4 \ + v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + + str q2, [src0, # 2*16] + str q3, [src0, # 3*16] + str q18, [src1, # 2*16] + str q19, [src1, # 3*16] + + srshr v14.4S, v0.4S, #23 + srshr v15.4S, v1.4S, #23 + srshr v30.4S, v16.4S, #23 + srshr v31.4S, v17.4S, #23 + + mls v0.4S, v14.4S, v4.S[0] + mls v1.4S, v15.4S, v4.S[0] + mls v16.4S, v30.4S, v4.S[0] + mls v17.4S, v31.4S, v4.S[0] + + str q0, [src0, # 0*16] + str q1, [src0, # 1*16] + str q16, [src1, # 0*16] + str q17, [src1, # 1*16] + + add table0, table0, #128 + add table1, table1, #128 + + add src0, src0, #64 + add src1, src1, #64 + .unreq Q .unreq RphRdp .unreq src0 - .unreq des0 .unreq src1 - .unreq des1 .unreq table0 .unreq table1 .unreq counter pop_all - br lr - - + ret diff --git a/crypto_sign/dilithium2/aarch64/__asm_poly.S b/crypto_sign/dilithium2/aarch64/__asm_poly.S index 24586662..e1225c59 100644 --- a/crypto_sign/dilithium2/aarch64/__asm_poly.S +++ b/crypto_sign/dilithium2/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,10 +32,10 @@ #include "params.h" .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32 -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32 -PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32: -_PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32 +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32 +PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32: +_PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32: mov x7, #16 _10_to_32_loop: @@ -45,7 +48,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32: str w4, [x0], #4 ubfx w5, w2, #20, #10 str w5, [x0], #4 - lsr w6, w2, #30 + lsr w6, w2, #30 ldr w2, [x1], #4 @@ -99,13 +102,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32: sub x7, x7, #1 cbnz x7, _10_to_32_loop - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce: ldr w4, [x1] @@ -117,7 +120,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: ld1 { v1.4S}, [x1], #16 ld1 { v2.4S}, [x1], #16 ld1 { v3.4S}, [x1], #16 - + ld1 { v4.4S}, [x1], #16 srshr v16.4S, v0.4S, #23 ld1 { v5.4S}, [x1], #16 @@ -126,7 +129,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -135,7 +138,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -192,13 +195,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq: ldr w4, [x1] @@ -285,13 +288,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze: ldr w4, [x1] @@ -312,7 +315,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -321,7 +324,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S sshr v16.4S, v0.4S, #31 mls v5.4S, v21.4S, v24.4S @@ -330,7 +333,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: sshr v18.4S, v2.4S, #31 mls v7.4S, v23.4S, v24.4S sshr v19.4S, v3.4S, #31 - + sshr v20.4S, v4.4S, #31 mls v0.4S, v16.4S, v24.4S sshr v21.4S, v5.4S, #31 @@ -339,7 +342,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S sshr v23.4S, v7.4S, #31 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -414,13 +417,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round: mov w4, #1 @@ -560,13 +563,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round: st1 {v30.4S}, [x1], #16 st1 {v31.4S}, [x1], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -609,13 +612,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -632,7 +635,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub: mov x16, #15 _poly_sub_loop: - + st1 {v16.4S}, [x0], #16 ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -658,13 +661,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl: add x1, x0, #0 @@ -725,13 +728,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl: st1 {v22.4S}, [x0], #16 st1 {v23.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery: push_all @@ -769,14 +772,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -819,14 +822,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -843,14 +846,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery -PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: -_PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery +PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery: +_PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery: push_all @@ -910,90 +913,90 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1027,14 +1030,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v27.4S, v23.4S, v31.4S ld1 { v7.4S}, [x2], #16 - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1064,90 +1067,90 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1173,14 +1176,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1194,7 +1197,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: pop_all - br lr + ret diff --git a/crypto_sign/dilithium2/aarch64/api.h b/crypto_sign/dilithium2/aarch64/api.h index c8dd59a7..254b49e1 100644 --- a/crypto_sign/dilithium2/aarch64/api.h +++ b/crypto_sign/dilithium2/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef PQCLEAN_DILITHIUM2_AARCH64_API_H -#define PQCLEAN_DILITHIUM2_AARCH64_API_H +#ifndef API_H +#define API_H /* * This file is dual licensed @@ -12,8 +12,8 @@ #define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES 1312 #define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES 2560 -#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES 2420 -#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_ALGNAME "Dilithium2" +#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES 2420 +#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_ALGNAME "Dilithium2" int PQCLEAN_DILITHIUM2_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); diff --git a/crypto_sign/dilithium2/aarch64/feat.S b/crypto_sign/dilithium2/aarch64/feat.S index 63be5df6..f467fa80 100644 --- a/crypto_sign/dilithium2/aarch64/feat.S +++ b/crypto_sign/dilithium2/aarch64/feat.S @@ -123,10 +123,8 @@ SOFTWARE. .endm .align 4 -.global PQCLEAN_DILITHIUM2_AARCH64_f1600x2 -.global _PQCLEAN_DILITHIUM2_AARCH64_f1600x2 -PQCLEAN_DILITHIUM2_AARCH64_f1600x2: -_PQCLEAN_DILITHIUM2_AARCH64_f1600x2: +.global _f1600x2 +_f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! diff --git a/crypto_sign/dilithium2/aarch64/fips202x2.c b/crypto_sign/dilithium2/aarch64/fips202x2.c index 2567f381..e045ee3d 100644 --- a/crypto_sign/dilithium2/aarch64/fips202x2.c +++ b/crypto_sign/dilithium2/aarch64/fips202x2.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -36,6 +37,11 @@ #include #include "fips202x2.h" +#ifdef PROFILE_HASHING +#include "hal.h" +extern unsigned long long hash_cycles; +#endif + #define NROUNDS 24 // Define NEON operation @@ -47,20 +53,20 @@ #define vxor(c, a, b) c = veorq_u64(a, b); // Rotate by n bit ((a << offset) ^ (a >> (64-offset))) #define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); + out = vshlq_n_u64(a, offset); \ + out = vsriq_n_u64(out, a, 64 - offset); // Xor chain: out = a ^ b ^ c ^ d ^ e #define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); // Not And c = ~a & b // #define vbic(c, a, b) c = vbicq_u64(b, a); // Xor Not And: out = a ^ ( (~b) & c) #define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); // Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support #define vrxor(c, a, b) c = vrax1q_u64(a, b); // End Define @@ -100,11 +106,11 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { * * Arguments: - uint64_t *state: pointer to input/output Keccak state **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_f1600x2(v128 *, const uint64_t *); +extern void f1600x2(v128 *, const uint64_t *); static inline void KeccakF1600_StatePermutex2(v128 state[25]) { #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_DILITHIUM2_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); + f1600x2(state, neon_KeccakF_RoundConstants); #else v128 Aba, Abe, Abi, Abo, Abu; v128 Aga, Age, Agi, Ago, Agu; @@ -551,7 +557,14 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -570,7 +583,14 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -587,7 +607,14 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -606,7 +633,14 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -625,6 +659,9 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -644,6 +681,10 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -662,6 +703,9 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -681,4 +725,8 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } diff --git a/crypto_sign/dilithium2/aarch64/fips202x2.h b/crypto_sign/dilithium2/aarch64/fips202x2.h index 28babbc3..3066c52b 100644 --- a/crypto_sign/dilithium2/aarch64/fips202x2.h +++ b/crypto_sign/dilithium2/aarch64/fips202x2.h @@ -8,9 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" -#include #include +#include typedef uint64x2_t v128; @@ -23,31 +22,26 @@ typedef struct { v128 s[25]; } keccakx2_state; -#define shake128x2_absorb DILITHIUM_NAMESPACE(shake128x2_absorb) void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake128x2_squeezeblocks DILITHIUM_NAMESPACE(shake128x2_squeezeblocks) void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake256x2_absorb DILITHIUM_NAMESPACE(shake256x2_absorb) void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake256x2_squeezeblocks DILITHIUM_NAMESPACE(shake256x2_squeezeblocks) void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake128x2 DILITHIUM_NAMESPACE(shake128x2) void shake128x2(uint8_t *out0, uint8_t *out1, size_t outlen, @@ -55,11 +49,11 @@ void shake128x2(uint8_t *out0, const uint8_t *in1, size_t inlen); -#define shake256x2 DILITHIUM_NAMESPACE(shake256x2) void shake256x2(uint8_t *out0, uint8_t *out1, size_t outlen, const uint8_t *in0, const uint8_t *in1, size_t inlen); + #endif diff --git a/crypto_sign/dilithium2/aarch64/macros.inc b/crypto_sign/dilithium2/aarch64/macros.inc index ef3af4c5..5504405c 100644 --- a/crypto_sign/dilithium2/aarch64/macros.inc +++ b/crypto_sign/dilithium2/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,24 +30,254 @@ #include "macros_common.inc" -.macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - trn1 \t0\qS, \a0\qS, \a1\qS - trn2 \t1\qS, \a0\qS, \a1\qS - trn1 \t2\qS, \a2\qS, \a3\qS - trn2 \t3\qS, \a2\qS, \a3\qS + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S - trn1 \a0\dD, \t0\dD, \t2\dD - trn2 \a2\dD, \t0\dD, \t2\dD - trn1 \a1\dD, \t1\dD, \t3\dD - trn2 \a3\dD, \t1\dD, \t3\dD + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D .endm -.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - wrap_trn_4x4 \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, .4S, .2D + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + +.macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + +.macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 .endm +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + + +.macro do_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== .macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S @@ -54,12 +287,20 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 .endm -.macro dq_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S .endm @@ -67,16 +308,32 @@ wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + .macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 .endm -.macro dq_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm @@ -89,16 +346,48 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + .macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm -.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 .endm -.macro qq_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S .endm @@ -109,3 +398,5 @@ .macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S .endm + +// === 32-bit end ==== diff --git a/crypto_sign/dilithium2/aarch64/macros_common.inc b/crypto_sign/dilithium2/aarch64/macros_common.inc index bd7e77eb..07568491 100644 --- a/crypto_sign/dilithium2/aarch64/macros_common.inc +++ b/crypto_sign/dilithium2/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,35 +28,58 @@ * SOFTWARE. */ +#ifndef MACROS_COMMON +#define MACROS_COMMON + // for ABI .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -72,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -82,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -99,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -135,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -149,7 +310,134 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t4\wX, \b4\wX, \z4\nX[\h4] @@ -176,7 +464,186 @@ .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] + mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] + mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] + mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -218,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -228,15 +769,82 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] sub \b1\wX, \a1\wX, \t1\wX mul \t3\wX, \b3\wX, \h3\wX + ldr \c2, [\srcc_ptr, \memc2] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b3\wX, \b3\wX, \l3\wX @@ -245,7 +853,7 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX @@ -262,57 +870,98 @@ .endm +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + // vector-scalar Barrett reduction .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -391,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX @@ -448,3 +1191,10 @@ add \s3\wX, \a3\wX, \b3\wX .endm + + +#endif + + + + diff --git a/crypto_sign/dilithium2/aarch64/ntt.c b/crypto_sign/dilithium2/aarch64/ntt.c index 2d88c5d5..92d92313 100644 --- a/crypto_sign/dilithium2/aarch64/ntt.c +++ b/crypto_sign/dilithium2/aarch64/ntt.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,13 +33,28 @@ */ #include "params.h" -#include "reduce.h" #include #include #include "NTT_params.h" #include "ntt.h" +const __attribute__ ((aligned (16)))int32_t constants[16] = { + Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, + invNQ1R2modQ1_prime_half, + invNQ1R2modQ1_doubleprime, + invNQ1_final_R2modQ1_prime_half, + invNQ1_final_R2modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { +0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { +0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 +}; + /************************************************* * Name: ntt * diff --git a/crypto_sign/dilithium2/aarch64/ntt.h b/crypto_sign/dilithium2/aarch64/ntt.h index 86796ca8..b26b6479 100644 --- a/crypto_sign/dilithium2/aarch64/ntt.h +++ b/crypto_sign/dilithium2/aarch64/ntt.h @@ -6,12 +6,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,45 +34,39 @@ * SOFTWARE. */ -#include "NTT_params.h" -#include "params.h" #include +#include "params.h" +#include "NTT_params.h" + +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants); +extern +const int32_t constants[16]; -#define NTT(in) { \ - PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define iNTT(in) { \ - PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; + +#define NTT(in) do { \ + PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + } while(0) #define ntt DILITHIUM_NAMESPACE(ntt) void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); -static const int constants[16] = { - Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, - invNQ1R2modQ1_prime_half, - invNQ1R2modQ1_doubleprime, - invNQ1_final_R2modQ1_prime_half, - invNQ1_final_R2modQ1_doubleprime -}; - -static const int streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 0, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 3073009, 327391679, 1277625, -675340520, -2635473, 987079667, 3852015, 449207, 1753, -495951789, -1935420, -681503850, -2659525, -373072124, -1455890, 681730119, 2660408, -456183549, -1780227, -15156688, -59148, 710479343, 2772600, 0, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, 4183372, -825844983, -3222807, -799869667, -3121440, -70227934, -274060, 302950022, 1182243, 22347069, 87208, 163212680, 636927, -1016110510, -3965306, -1013916752, -3956745, -588452222, -2296397, -841760171, -3284915, -952468207, -3716946, 0, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 2508980, 519705671, 2028118, 496502727, 1937570, -977780347, -3815725, -7126831, -27812, 210776307, 822541, 258649997, 1009365, -628875181, -2454145, -507246529, -1979497, 409185979, 1596822, -1013967746, -3956944, -963363710, -3759465, 0, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, 2811291, -764594519, -2983781, -284313712, -1109516, 1065510939, 4158088, -431820817, -1685153, -873958779, -3410568, 686309310, 2678278, -965793731, -3768948, -909946047, -3551006, 162963861, 635956, -64176841, -250446, -629190881, -2455377, 0, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 1528066, 123678909, 482649, 294395108, 1148858, -759080783, -2962264, -1062481036, -4146264, -454226054, -1772588, 561940831, 2192938, -442566669, -1727088, 611800717, 2387513, -925511710, -3611750, -68791907, -268456, -814992530, -3180456, 0, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, -565603, 43482586, 169688, 631001801, 2462444, -854436357, -3334383, 960233614, 3747250, 588375860, 2296099, 317727459, 1239911, -983611064, -3838479, 818892658, 3195676, 677264190, 2642980, 321386456, 1254190, -3181859, -12417, 0, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -4166425, -893898890, -3488383, 509377762, 1987814, -819295484, -3197248, 768294260, 2998219, 36345249, 141835, -22883400, -89301, 643961400, 2513018, -347191365, -1354892, 157142369, 613238, -335754661, -1310261, -568482643, -2218467, 0, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 1736313, 60323094, 235407, -832852657, -3250154, 834980303, 3258457, -117552223, -458740, -492511373, -1921994, 1035301089, 4040196, -889718424, -3472069, 522531086, 2039144, -481719139, -1879878, -209807681, -818761, -558360247, -2178965, 0, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, -2579253, 458160776, 1787943, -612717067, -2391089, -577774276, -2254727, -415984810, -1623354, 539479988, 2105286, -608441020, -2374402, -521163479, -2033807, 150224382, 586241, -302276083, -1179613, 135295244, 527981, -702999655, -2743411, 0, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, 3482206, -1071872863, -4182915, -333129378, -1300016, -605279149, -2362063, -378477722, -1476985, 510974714, 1994046, 638402564, 2491325, -356997292, -1393159, 130156402, 507927, -304395785, -1187885, -185731180, -724804, -470097680, -1834526, 0, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, -1317678, 630730945, 2461387, 777970524, 3035980, 159173408, 621164, -777397036, -3033742, -86720197, -338420, 678549029, 2647994, 771248568, 3009748, -669544140, -2612853, 1063046068, 4148469, 192079267, 749577, -1030830548, -4022750, 0, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, 3901472, -314332144, -1226661, 749740976, 2925816, 864652284, 3374250, 1020029345, 3980599, 658309618, 2569011, -413979908, -1615530, 441577800, 1723229, 426738094, 1665318, 519685171, 2028038, 298172236, 1163598, -863376927, -3369273, 0, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, 1356448, -711287812, -2775755, 687588511, 2683270, -712065019, -2778788, 1023635298, 3994671, -3043996, -11879, -351195274, -1370517, 773976352, 3020393, 861908357, 3363542, 55063046, 214880, 139752717, 545376, -197425671, -770441, 0, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, -3467665, 592665232, 2312838, -167401858, -653275, -117660617, -459163, 795799901, 3105558, -282732136, -1103344, 130212265, 508145, -141890356, -553718, 220412084, 860144, 879049958, 3430436, 35937555, 140244, -388001774, -1514152, 0, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, 348812, -84011120, -327848, 259126110, 1011223, -603268097, -2354215, -559928242, -2185084, 800464680, 3123762, 604333585, 2358373, -561979013, -2193087, -772445769, -3014420, -439933955, -1716814, 749801963, 2926054, -100631253, -392707, 0, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -3818627, -492577742, -1922253, -573161516, -2236726, 447030292, 1744507, -77645096, -303005, 904878186, 3531229, -1018462631, -3974485, -967019376, -3773731, 486888731, 1900052, -200355636, -781875, 270210213, 1054478, -187430119, -731434, 0, 0 -}; - -static const int streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 0, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, -1744507, 573161516, 2236726, 492577742, 1922253, 978523985, 3818627, 187430119, 731434, -270210213, -1054478, 200355636, 781875, -486888731, -1900052, 967019376, 3773731, 1018462631, 3974485, -904878186, -3531229, 77645096, 303005, 0, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, 2354215, -259126110, -1011223, 84011120, 327848, -89383150, -348812, 100631253, 392707, -749801963, -2926054, 439933955, 1716814, 772445769, 3014420, 561979013, 2193087, -604333585, -2358373, -800464680, -3123762, 559928242, 2185084, 0, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 459163, 167401858, 653275, -592665232, -2312838, 888589898, 3467665, 388001774, 1514152, -35937555, -140244, -879049958, -3430436, -220412084, -860144, 141890356, 553718, -130212265, -508145, 282732136, 1103344, -795799901, -3105558, 0, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, 2778788, -687588511, -2683270, 711287812, 2775755, -347590090, -1356448, 197425671, 770441, -139752717, -545376, -55063046, -214880, -861908357, -3363542, -773976352, -3020393, 351195274, 1370517, 3043996, 11879, -1023635298, -3994671, 0, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -3374250, -749740976, -2925816, 314332144, 1226661, -999753034, -3901472, 863376927, 3369273, -298172236, -1163598, -519685171, -2028038, -426738094, -1665318, -441577800, -1723229, 413979908, 1615530, -658309618, -2569011, -1020029345, -3980599, 0, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -621164, -777970524, -3035980, -630730945, -2461387, 337655269, 1317678, 1030830548, 4022750, -192079267, -749577, -1063046068, -4148469, 669544140, 2612853, -771248568, -3009748, -678549029, -2647994, 86720197, 338420, 777397036, 3033742, 0, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 2362063, 333129378, 1300016, 1071872863, 4182915, -892316032, -3482206, 470097680, 1834526, 185731180, 724804, 304395785, 1187885, -130156402, -507927, 356997292, 1393159, -638402564, -2491325, -510974714, -1994046, 378477722, 1476985, 0, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 2254727, 612717067, 2391089, -458160776, -1787943, 660934133, 2579253, 702999655, 2743411, -135295244, -527981, 302276083, 1179613, -150224382, -586241, 521163479, 2033807, 608441020, 2374402, -539479988, -2105286, 415984810, 1623354, 0, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, -3258457, 832852657, 3250154, -60323094, -235407, -444930577, -1736313, 558360247, 2178965, 209807681, 818761, 481719139, 1879878, -522531086, -2039144, 889718424, 3472069, -1035301089, -4040196, 492511373, 1921994, 117552223, 458740, 0, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, 3197248, -509377762, -1987814, 893898890, 3488383, 1067647297, 4166425, 568482643, 2218467, 335754661, 1310261, -157142369, -613238, 347191365, 1354892, -643961400, -2513018, 22883400, 89301, -36345249, -141835, -768294260, -2998219, 0, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, 3334383, -631001801, -2462444, -43482586, -169688, 144935890, 565603, 3181859, 12417, -321386456, -1254190, -677264190, -2642980, -818892658, -3195676, 983611064, 3838479, -317727459, -1239911, -588375860, -2296099, -960233614, -3747250, 0, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, 2962264, -294395108, -1148858, -123678909, -482649, -391567239, -1528066, 814992530, 3180456, 68791907, 268456, 925511710, 3611750, -611800717, -2387513, 442566669, 1727088, -561940831, -2192938, 454226054, 1772588, 1062481036, 4146264, 0, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, -4158088, 284313712, 1109516, 764594519, 2983781, -720393920, -2811291, 629190881, 2455377, 64176841, 250446, -162963861, -635956, 909946047, 3551006, 965793731, 3768948, -686309310, -2678278, 873958779, 3410568, 431820817, 1685153, 0, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, 3815725, -496502727, -1937570, -519705671, -2028118, -642926661, -2508980, 963363710, 3759465, 1013967746, 3956944, -409185979, -1596822, 507246529, 1979497, 628875181, 2454145, -258649997, -1009365, -210776307, -822541, 7126831, 27812, 0, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 274060, 799869667, 3121440, 825844983, 3222807, -1071989969, -4183372, 952468207, 3716946, 841760171, 3284915, 588452222, 2296397, 1013916752, 3956745, 1016110510, 3965306, -163212680, -636927, -22347069, -87208, -302950022, -1182243, 0, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, -3852015, 675340520, 2635473, -327391679, -1277625, -787459213, -3073009, -710479343, -2772600, 15156688, 59148, 456183549, 1780227, -681730119, -2660408, 373072124, 1455890, 681503850, 2659525, 495951789, 1935420, -449207, -1753, 0, 0 -}; #endif diff --git a/crypto_sign/dilithium2/aarch64/packing.c b/crypto_sign/dilithium2/aarch64/packing.c index 8fa3b0cc..1d46d7a6 100644 --- a/crypto_sign/dilithium2/aarch64/packing.c +++ b/crypto_sign/dilithium2/aarch64/packing.c @@ -19,7 +19,7 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1) { unsigned int i; @@ -45,7 +45,7 @@ void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], **************************************************/ void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) { + const uint8_t pk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -71,7 +71,7 @@ void unpack_pk(uint8_t rho[SEEDBYTES], * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 **************************************************/ -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -129,7 +129,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]) { + const uint8_t sk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -172,7 +172,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h **************************************************/ -void pack_sig(uint8_t sig[CRYPTO_BYTES], +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h) { @@ -221,7 +221,7 @@ void pack_sig(uint8_t sig[CRYPTO_BYTES], int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, - const uint8_t sig[CRYPTO_BYTES]) { + const uint8_t sig[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES]) { unsigned int i, j, k; for (i = 0; i < CTILDEBYTES; ++i) { diff --git a/crypto_sign/dilithium2/aarch64/packing.h b/crypto_sign/dilithium2/aarch64/packing.h index fb70ce5d..9021a864 100644 --- a/crypto_sign/dilithium2/aarch64/packing.h +++ b/crypto_sign/dilithium2/aarch64/packing.h @@ -7,15 +7,16 @@ * or public domain at https://github.com/pq-crystals/dilithium */ +#include "api.h" #include "params.h" #include "polyvec.h" #include #define pack_pk DILITHIUM_NAMESPACE(pack_pk) -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); #define pack_sk DILITHIUM_NAMESPACE(pack_sk) -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -24,10 +25,10 @@ void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], const polyveck *s2); #define pack_sig DILITHIUM_NAMESPACE(pack_sig) -void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); #define unpack_pk DILITHIUM_NAMESPACE(unpack_pk) -void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]); +void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES]); #define unpack_sk DILITHIUM_NAMESPACE(unpack_sk) void unpack_sk(uint8_t rho[SEEDBYTES], @@ -36,9 +37,9 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + const uint8_t sk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES]); #define unpack_sig DILITHIUM_NAMESPACE(unpack_sig) -int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); +int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES]); #endif diff --git a/crypto_sign/dilithium2/aarch64/params.h b/crypto_sign/dilithium2/aarch64/params.h index 7601e765..ced52a7c 100644 --- a/crypto_sign/dilithium2/aarch64/params.h +++ b/crypto_sign/dilithium2/aarch64/params.h @@ -11,8 +11,8 @@ //#define DILITHIUM_MODE 3 //#define DILITHIUM_MODE 5 -#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM2_AARCH64_crypto_sign #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM2_AARCH64_##s +#define CRYPTO_NAMESPACETOP crypto_sign #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s) @@ -25,6 +25,8 @@ #define D 13 #define ROOT_OF_UNITY 1753 +#if DILITHIUM_MODE == 2 + #define K 4 #define L 4 #define ETA 2 @@ -35,23 +37,66 @@ #define OMEGA 80 #define CRYPTO_ALGNAME "Dilithium2" #define CTILDEBYTES 32 +#elif DILITHIUM_MODE == 3 + +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((DILITHIUM_Q-1)/32) +#define OMEGA 55 +#define CRYPTO_ALGNAME "Dilithium3" +#define CTILDEBYTES 48 +#elif DILITHIUM_MODE == 5 + +#define K 8 +#define L 7 +#define ETA 2 +#define TAU 60 +#define BETA 120 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((DILITHIUM_Q-1)/32) +#define OMEGA 75 +#define CRYPTO_ALGNAME "Dilithium5" +#define CTILDEBYTES 64 +#else + +#error "No parameter specified!" + +#endif + + #define POLYT1_PACKEDBYTES 320 #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) +#if GAMMA1 == (1 << 17) #define POLYZ_PACKEDBYTES 576 +#elif GAMMA1 == (1 << 19) +#define POLYZ_PACKEDBYTES 640 +#endif +#if GAMMA2 == (DILITHIUM_Q-1)/88 #define POLYW1_PACKEDBYTES 192 +#elif GAMMA2 == (DILITHIUM_Q-1)/32 +#define POLYW1_PACKEDBYTES 128 +#endif +#if ETA == 2 #define POLYETA_PACKEDBYTES 96 +#elif ETA == 4 +#define POLYETA_PACKEDBYTES 128 +#endif -#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) -#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ - + TRBYTES \ - + L*POLYETA_PACKEDBYTES \ - + K*POLYETA_PACKEDBYTES \ - + K*POLYT0_PACKEDBYTES) -#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) +#define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ + + TRBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define DILITHIUM_CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/crypto_sign/dilithium2/aarch64/poly.c b/crypto_sign/dilithium2/aarch64/poly.c index d2c371ba..62721a38 100644 --- a/crypto_sign/dilithium2/aarch64/poly.c +++ b/crypto_sign/dilithium2/aarch64/poly.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,13 +41,8 @@ #include "fips202x2.h" -#include "NTT_params.h" #include "ntt.h" -static const int32_t montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; - #define DBENCH_START() #define DBENCH_STOP(t) @@ -57,11 +54,11 @@ static const int32_t montgomery_const[4] = { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce(int32_t *, const int32_t *); void poly_reduce(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -74,11 +71,11 @@ void poly_reduce(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq(int32_t *, const int32_t *); void poly_caddq(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -91,11 +88,11 @@ void poly_caddq(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze(int32_t *, const int32_t *); void poly_freeze(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -205,11 +202,11 @@ void poly_invntt_tomont(poly *a) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, constants); DBENCH_STOP(*tmul); } @@ -226,11 +223,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { * - poly *a0: pointer to output polynomial with coefficients c0 * - const poly *a: pointer to input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *); void poly_power2round(poly *a1, poly *a0, const poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); DBENCH_STOP(*tround); } @@ -470,6 +467,8 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; + #if ETA == 2 + if (t0 < 15) { t0 = t0 - (205 * t0 >> 10) * 5; a[ctr++] = 2 - t0; @@ -479,6 +478,21 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 2 - t1; } + #elif ETA == 4 + + if (t0 < 9) { + a[ctr++] = 4 - t0; + } + if (t1 < 9 && ctr < len) { + a[ctr++] = 4 - t1; + } + + #else + +#error "No parameter specified!" + + #endif + } DBENCH_STOP(*tsample); @@ -496,7 +510,11 @@ static unsigned int rej_eta(int32_t *a, * - const uint8_t seed[]: byte array with seed of length CRHBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ +#if ETA == 2 #define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +#elif ETA == 4 +#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +#endif void poly_uniform_eta(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { @@ -643,6 +661,8 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); + #if ETA == 2 + for (i = 0; i < N / 8; ++i) { t[0] = ETA - a->coeffs[8 * i + 0]; t[1] = ETA - a->coeffs[8 * i + 1]; @@ -658,6 +678,20 @@ void polyeta_pack(uint8_t *r, const poly *a) { r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); } + #elif ETA == 4 + + for (i = 0; i < N / 2; ++i) { + t[0] = ETA - a->coeffs[2 * i + 0]; + t[1] = ETA - a->coeffs[2 * i + 1]; + r[i] = t[0] | (t[1] << 4); + } + + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -673,6 +707,8 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); + #if ETA == 2 + for (i = 0; i < N / 8; ++i) { r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; @@ -693,6 +729,21 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; } + #elif ETA == 4 + + for (i = 0; i < N / 2; ++i) { + r->coeffs[2 * i + 0] = a[i] & 0x0F; + r->coeffs[2 * i + 1] = a[i] >> 4; + r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; + } + + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -730,11 +781,11 @@ void polyt1_pack(uint8_t *r, const poly *a) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32(int32_t *, const uint8_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32(int32_t *, const uint8_t *); void polyt1_unpack(poly *r, const uint8_t *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32(r->coeffs, a); + PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32(r->coeffs, a); DBENCH_STOP(*tpack); } @@ -865,6 +916,8 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); + #if GAMMA1 == (1 << 17) + for (i = 0; i < N / 4; ++i) { t[0] = GAMMA1 - a->coeffs[4 * i + 0]; t[1] = GAMMA1 - a->coeffs[4 * i + 1]; @@ -885,6 +938,26 @@ void polyz_pack(uint8_t *r, const poly *a) { r[9 * i + 8] = t[3] >> 10; } + #elif GAMMA1 == (1 << 19) + + for (i = 0; i < N / 2; ++i) { + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; + + r[5 * i + 0] = t[0]; + r[5 * i + 1] = t[0] >> 8; + r[5 * i + 2] = t[0] >> 16; + r[5 * i + 2] |= t[1] << 4; + r[5 * i + 3] = t[1] >> 4; + r[5 * i + 4] = t[1] >> 12; + } + + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -901,6 +974,8 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); + #if GAMMA1 == (1 << 17) + for (i = 0; i < N / 4; ++i) { r->coeffs[4 * i + 0] = a[9 * i + 0]; r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; @@ -928,6 +1003,29 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; } + #elif GAMMA1 == (1 << 19) + + for (i = 0; i < N / 2; ++i) { + r->coeffs[2 * i + 0] = a[5 * i + 0]; + r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; + r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16; + r->coeffs[2 * i + 0] &= 0xFFFFF; + + r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; + r->coeffs[2 * i + 0] &= 0xFFFFF; + + r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; + } + + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -945,6 +1043,8 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); + #if GAMMA2 == (DILITHIUM_Q-1)/88 + for (i = 0; i < N / 4; ++i) { r[3 * i + 0] = a->coeffs[4 * i + 0]; r[3 * i + 0] |= a->coeffs[4 * i + 1] << 6; @@ -954,5 +1054,17 @@ void polyw1_pack(uint8_t *r, const poly *a) { r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2; } + #elif GAMMA2 == (DILITHIUM_Q-1)/32 + + for (i = 0; i < N / 2; ++i) { + r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); + } + + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium2/aarch64/polyvec.c b/crypto_sign/dilithium2/aarch64/polyvec.c index 2018807b..b03dd1fa 100644 --- a/crypto_sign/dilithium2/aarch64/polyvec.c +++ b/crypto_sign/dilithium2/aarch64/polyvec.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,17 +32,14 @@ * SOFTWARE. */ +#include + #include "params.h" #include "poly.h" #include "polyvec.h" -#include - +#include "ntt.h" #include "reduce.h" -static const int32_t l_montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; - /************************************************* * Name: expand_mat * @@ -177,11 +176,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { - PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, constants); } /************************************************* diff --git a/crypto_sign/dilithium2/aarch64/polyvec.h b/crypto_sign/dilithium2/aarch64/polyvec.h index dc3377c9..8844ca79 100644 --- a/crypto_sign/dilithium2/aarch64/polyvec.h +++ b/crypto_sign/dilithium2/aarch64/polyvec.h @@ -42,9 +42,12 @@ void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v); + #define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm) int polyvecl_chknorm(const polyvecl *v, int32_t B); + + /* Vectors of polynomials of length K */ typedef struct { poly vec[K]; diff --git a/crypto_sign/dilithium2/aarch64/rounding.c b/crypto_sign/dilithium2/aarch64/rounding.c index f5efb266..30c97510 100644 --- a/crypto_sign/dilithium2/aarch64/rounding.c +++ b/crypto_sign/dilithium2/aarch64/rounding.c @@ -47,10 +47,22 @@ int32_t decompose(int32_t *a0, int32_t a) { int32_t a1; a1 = (a + 127) >> 7; + #if GAMMA2 == (DILITHIUM_Q-1)/32 + + a1 = (a1 * 1025 + (1 << 21)) >> 22; + a1 &= 15; + + #elif GAMMA2 == (DILITHIUM_Q-1)/88 a1 = (a1 * 11275 + (1 << 23)) >> 24; a1 ^= ((43 - a1) >> 31) & a1; + #else + +#error "No parameter specified" + + #endif + *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; return a1; @@ -93,9 +105,22 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } + #if GAMMA2 == (DILITHIUM_Q-1)/32 + + if (a0 > 0) { + return (a1 + 1) & 15; + } else { + return (a1 - 1) & 15; + } + + #elif GAMMA2 == (DILITHIUM_Q-1)/88 + if (a0 > 0) { return (a1 == 43) ? 0 : a1 + 1; + } else { + return (a1 == 0) ? 43 : a1 - 1; } - return (a1 == 0) ? 43 : a1 - 1; + + #endif } diff --git a/crypto_sign/dilithium2/aarch64/sign.c b/crypto_sign/dilithium2/aarch64/sign.c index 3565b370..ad2fd833 100644 --- a/crypto_sign/dilithium2/aarch64/sign.c +++ b/crypto_sign/dilithium2/aarch64/sign.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -90,7 +91,7 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { pack_pk(pk, rho, &t1); /* Compute H(rho, t1) and write secret key */ - shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(tr, TRBYTES, pk, PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES); pack_sk(sk, rho, tr, key, &t0, &s1, &s2); return 0; @@ -139,7 +140,8 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); - for (n = 0; n < RNDBYTES; n++) { + + for(n = 0; n < RNDBYTES; n++) { rnd[n] = 0; } shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); @@ -210,7 +212,7 @@ int crypto_sign_signature(uint8_t *sig, /* Write signature */ pack_sig(sig, sig, &z, &h); - *siglen = CRYPTO_BYTES; + *siglen = PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES; return 0; } @@ -238,9 +240,9 @@ int crypto_sign(uint8_t *sm, size_t i; for (i = 0; i < mlen; ++i) { - sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + sm[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; } - crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); + crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES, mlen, sk); *smlen += mlen; return 0; } @@ -274,7 +276,7 @@ int crypto_sign_verify(const uint8_t *sig, polyveck t1, w1, h; shake256incctx state; - if (siglen != CRYPTO_BYTES) { + if (siglen != PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES) { return -1; } @@ -287,7 +289,7 @@ int crypto_sign_verify(const uint8_t *sig, } /* Compute CRH(H(rho, t1), msg) */ - shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(mu, CRHBYTES, pk, PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES); shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -353,17 +355,17 @@ int crypto_sign_open(uint8_t *m, const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; - if (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES; + if (crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] = sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES + i]; } return 0; } diff --git a/crypto_sign/dilithium2/aarch64/sign.h b/crypto_sign/dilithium2/aarch64/sign.h index bc8c4265..05e7b5f6 100644 --- a/crypto_sign/dilithium2/aarch64/sign.h +++ b/crypto_sign/dilithium2/aarch64/sign.h @@ -13,6 +13,7 @@ #include #include + #define challenge DILITHIUM_NAMESPACE(challenge) void challenge(poly *c, const uint8_t seed[SEEDBYTES]); @@ -24,7 +25,7 @@ int crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); -#define crypto_sign DILITHIUM_NAMESPACETOP +#define crypto_sign DILITHIUM_NAMESPACE(crypto_sign) int crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk); diff --git a/crypto_sign/dilithium2/aarch64/symmetric-shake.c b/crypto_sign/dilithium2/aarch64/symmetric-shake.c index a53074aa..53aab1c9 100644 --- a/crypto_sign/dilithium2/aarch64/symmetric-shake.c +++ b/crypto_sign/dilithium2/aarch64/symmetric-shake.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/crypto_sign/dilithium2/aarch64/symmetric.h b/crypto_sign/dilithium2/aarch64/symmetric.h index 40b928ec..74d21021 100644 --- a/crypto_sign/dilithium2/aarch64/symmetric.h +++ b/crypto_sign/dilithium2/aarch64/symmetric.h @@ -6,8 +6,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -59,6 +60,7 @@ void dilithium_shake256x2_stream_init(keccakx2_state *state, const uint8_t seed[CRHBYTES], uint16_t nonce1, uint16_t nonce2); + #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE diff --git a/crypto_sign/dilithium3/aarch64/LICENSE b/crypto_sign/dilithium3/aarch64/LICENSE index 0e259d42..093b0a7d 100644 --- a/crypto_sign/dilithium3/aarch64/LICENSE +++ b/crypto_sign/dilithium3/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/crypto_sign/dilithium3/aarch64/NTT_params.h b/crypto_sign/dilithium3/aarch64/NTT_params.h index 582c16ed..dc261a2d 100644 --- a/crypto_sign/dilithium3/aarch64/NTT_params.h +++ b/crypto_sign/dilithium3/aarch64/NTT_params.h @@ -2,7 +2,9 @@ #define NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -42,14 +44,14 @@ #define invomegaQ1 731434 // R = 2^32 below // RmodQ1 = 2^32 mod^{+-} Q1 -#define RmodQ1 (-4186625) +#define RmodQ1 -4186625 // Q1prime = Q1^{-1} mod^{+-} 2^32 #define Q1prime 58728449 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 8347681 // invNQ1R2modQ1 = -NTT_N^{-1} 2^32 2^32 mod^{+-} Q1 below -#define invNQ1R2modQ1 (-41978) +#define invNQ1R2modQ1 -41978 // invNQ1R2modQ1_prime = invNQ1R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 #define invNQ1R2modQ1_prime 8395782 // invNQ1R2modQ1_prime_half = (invNQ1R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 @@ -60,11 +62,11 @@ // invNQ1_final_R2modQ1 = -invNQ1R2modQ1 invomegaQ1^{128} mod q #define invNQ1_final_R2modQ1 4404704 // invNQ1_final_R2modQ1_prime = invNQ1_final_R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime (-151046688) +#define invNQ1_final_R2modQ1_prime -151046688 // invNQ1_final_R2modQ1_prime_half = (invNQ1_final_R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime_half (-75523344) +#define invNQ1_final_R2modQ1_prime_half -75523344 // invNQ1_final_R2modQ1_doubleprime = (invNQ1_final_R2modQ1_prime Q1 - invNQ1_final_R2modQ1) / 2^32 -#define invNQ1_final_R2modQ1_doubleprime (-294725) +#define invNQ1_final_R2modQ1_doubleprime -294725 // RmodQ1_prime = -(RmodQ1 + Q1) Q1prime mod^{+-} 2^32 #define RmodQ1_prime 512 diff --git a/crypto_sign/dilithium3/aarch64/__asm_NTT.S b/crypto_sign/dilithium3/aarch64/__asm_NTT.S index 0c7732d2..fad81730 100644 --- a/crypto_sign/dilithium3/aarch64/__asm_NTT.S +++ b/crypto_sign/dilithium3/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,261 +30,413 @@ #include "macros.inc" -.align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top -PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top: -_PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top: - - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 - - ldr Q, [x2] - - mov table, x1 - - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 - - add src11, src0, #704 - add src12, src0, #768 +#include "params.h" - add src13, src0, #832 - add src14, src0, #896 +.align 2 +.global PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top +PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top: +_PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top: - add src15, src0, #960 + push_simd + Q .req w8 + src .req x0 + counter .req x11 - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 + ldr Q, [x2] mov v20.S[0], Q - ld1 { v1.4S}, [ src1] - ld1 { v3.4S}, [ src3] - ld1 { v5.4S}, [ src5] - ld1 { v7.4S}, [ src7] - ld1 { v9.4S}, [ src9] - ld1 {v11.4S}, [src11] - ld1 {v13.4S}, [src13] - ld1 {v15.4S}, [src15] - - ld1 { v0.4S}, [ src0] - ld1 { v2.4S}, [ src2] - ld1 { v4.4S}, [ src4] - ld1 { v6.4S}, [ src6] - ld1 { v8.4S}, [ src8] - ld1 {v10.4S}, [src10] - ld1 {v12.4S}, [src12] - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + ldr q9, [src, #9*64] + ldr q11, [src, #11*64] + ldr q13, [src, #13*64] + ldr q15, [src, #15*64] + + qq_butterfly_topl \ + v9, v11, v13, v15, v16, v17, v18, v19, v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64 + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) mov counter, #3 _ntt_top_loop: - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mixssl \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + v9, v11, v13, v15, v16, v17, v18, v19, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64, \ + src, \ + q1, q3, q5, q7, \ + #(16+1*64), #(16+3*64), #(16+5*64), #(16+7*64) + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #(16+8*64), #(16+10*64), #(16+12*64), #(16+14*64), \ + src, \ + q0, q2, q4, q6, \ + #(16+0*64), #(16+2*64), #(16+4*64), #(16+6*64) + + add src, src, #16 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) sub counter, counter, #1 cbnz counter, _ntt_top_loop - st1 { v1.4S}, [ src1], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v7.4S}, [ src7], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v11.4S}, [src11], #16 - st1 {v13.4S}, [src13], #16 - st1 {v15.4S}, [src15], #16 - - st1 { v0.4S}, [ src0], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v8.4S}, [ src8], #16 - st1 {v10.4S}, [src10], #16 - st1 {v12.4S}, [src12], #16 - st1 {v14.4S}, [src14], #16 + qq_butterfly_botss \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter - pop_all + pop_simd - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot -PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot: -_PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot: - - push_all - Q .req w20 - src0 .req x0 - des0 .req x1 - src1 .req x2 - des1 .req x3 - table0 .req x28 - table1 .req x27 - counter .req x19 +.global PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot +PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot: +_PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot: + + push_simd + Q .req w8 + src .req x0 + table0 .req x9 + table1 .req x10 + counter .req x11 ldr Q, [x2] add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + ldr q0, [src, #0*16] + ldr q1, [src, #1*16] + ldr q2, [src, #2*16] + ldr q3, [src, #3*16] + + ldr q4, [table0, #0*16] + ldr q5, [table0, #1*16] + ldr q20, [table1, #0*16] + ldr q21, [table1, #1*16] + + dq_butterfly_topl4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 + + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + - add des0, src0, #0 - add des1, src0, #512 + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - mov counter, #8 + str q0, [src, #0*16] + str q2, [src, #2*16] + + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 + + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + mov counter, #3 _ntt_bot_loop: - ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld1 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + trn_4x4_l4 v12, v13, v14, v15, v8, v9, v10, v11, src, q0, q1, q2, q3, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + str q12, [src, #0*16] + str q13, [src, #1*16] - mov v4.S[0], Q + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - dq_butterfly_top v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 - dq_butterfly_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + str q14, [src, #2*16] + str q15, [src, #3*16] + + + add src, src, #64 + + trn_4x4_l4 v28, v29, v30, v31, v24, v25, v26, v27, src, q16, q17, q18, q19, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q28, q29, q30, q31, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + add table0, table0, #128 + add table1, table1, #128 + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 - dq_butterfly_vec_top v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7 - dq_butterfly_vec_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 - dq_butterfly_vec_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) + + str q0, [src, #0*16] + str q2, [src, #2*16] + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 - st4 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st4 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - .unreq Q - .unreq src0 - .unreq des0 - .unreq src1 - .unreq des1 - .unreq table0 - .unreq table1 - .unreq counter - pop_all + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - br lr + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + trn_4x4_s4 v28, v29, v30, v31, v16, v17, v18, v19, src, q12, q13, q14, q15, #0*16, #1*16, #2*16, #3*16 + str q28, [src, #(512+0*16)] + str q29, [src, #(512+1*16)] + str q30, [src, #(512+2*16)] + str q31, [src, #(512+3*16)] + add src, src, #64 + .unreq Q + .unreq src + .unreq table0 + .unreq table1 + .unreq counter + pop_simd + ret diff --git a/crypto_sign/dilithium3/aarch64/__asm_iNTT.S b/crypto_sign/dilithium3/aarch64/__asm_iNTT.S index 7c05e2ec..49dbca85 100644 --- a/crypto_sign/dilithium3/aarch64/__asm_iNTT.S +++ b/crypto_sign/dilithium3/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top -PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: -_PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top +PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top: +_PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top: push_all Q .req w20 @@ -41,23 +44,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: invNR2dp .req w25 invNWR2ph .req w26 invNWR2dp .req w27 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 counter .req x19 ldr Q, [x2, #0] @@ -69,77 +56,63 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: ldr invNWR2ph, [x2, #24] ldr invNWR2dp, [x2, #28] - mov table, x1 + ldr q20, [x1, #0*16] + ldr q21, [x1, #1*16] + ldr q22, [x1, #2*16] + ldr q23, [x1, #3*16] + ldr q24, [x1, #4*16] + ldr q25, [x1, #5*16] + ldr q26, [x1, #6*16] + ldr q27, [x1, #7*16] - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 + mov v20.S[0], Q - add src11, src0, #704 - add src12, src0, #768 + ldr q0, [src, # 0*64] + ldr q1, [src, # 1*64] - add src13, src0, #832 - add src14, src0, #896 + ldr q2, [src, # 2*64] + ldr q3, [src, # 3*64] - add src15, src0, #960 + ldr q4, [src, # 4*64] + ldr q5, [src, # 5*64] - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ldr q6, [src, # 6*64] + ldr q7, [src, # 7*64] - mov v20.S[0], Q + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 - ld1 { v0.4S}, [ src0] - ld1 { v1.4S}, [ src1] - ld1 { v2.4S}, [ src2] - ld1 { v3.4S}, [ src3] - ld1 { v4.4S}, [ src4] - ld1 { v5.4S}, [ src5] - ld1 { v6.4S}, [ src6] - ld1 { v7.4S}, [ src7] - - ld1 { v8.4S}, [ src8] - ld1 { v9.4S}, [ src9] - ld1 {v10.4S}, [src10] - ld1 {v11.4S}, [src11] - ld1 {v12.4S}, [src12] - ld1 {v13.4S}, [src13] - ld1 {v14.4S}, [src14] - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + mov counter, #3 + _intt_top_loop: + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -153,77 +126,99 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: sub v17.4S, v17.4S, v19.4S mla v0.4S, v16.4S, v29.4S - mla v1.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v2.4S + mla v1.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v3.4S + + str q0, [src, #0*64] cmge v16.4S, v2.4S, v30.4S + ldr q0, [src, #(16 + 0*64)] + str q1, [src, #1*64] cmge v17.4S, v3.4S, v30.4S + ldr q1, [src, #(16 + 1*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v2.4S, v16.4S, v29.4S - mla v3.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v4.4S + mla v3.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v5.4S + + str q2, [src, #2*64] cmge v16.4S, v4.4S, v30.4S + ldr q2, [src, #(16 + 2*64)] + str q3, [src, #3*64] cmge v17.4S, v5.4S, v30.4S + ldr q3, [src, #(16 + 3*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v4.4S, v16.4S, v29.4S - mla v5.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v6.4S + mla v5.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v7.4S + + str q4, [src, #4*64] cmge v16.4S, v6.4S, v30.4S + ldr q4, [src, #(16 + 4*64)] + str q5, [src, #5*64] cmge v17.4S, v7.4S, v30.4S + ldr q5, [src, #(16 + 5*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v6.4S, v16.4S, v29.4S - mla v7.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v8.4S + mla v7.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v9.4S + + str q6, [src, #6*64] cmge v16.4S, v8.4S, v30.4S + ldr q6, [src, #(16 + 6*64)] + str q7, [src, #7*64] cmge v17.4S, v9.4S, v30.4S + ldr q7, [src, #(16 + 7*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v8.4S, v16.4S, v29.4S - mla v9.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v10.4S + mla v9.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v11.4S + + str q8, [src, #8*64] cmge v16.4S, v10.4S, v30.4S + str q9, [src, #9*64] cmge v17.4S, v11.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v10.4S, v16.4S, v29.4S - mla v11.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v12.4S + mla v11.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v13.4S + + str q10, [src, #10*64] cmge v16.4S, v12.4S, v30.4S + str q11, [src, #11*64] cmge v17.4S, v13.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v12.4S, v16.4S, v29.4S - mla v13.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v14.4S + mla v13.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v15.4S + + str q12, [src, #12*64] cmge v16.4S, v14.4S, v30.4S + str q13, [src, #13*64] cmge v17.4S, v15.4S, v30.4S sub v16.4S, v16.4S, v18.4S @@ -232,66 +227,45 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - mov counter, #3 - _intt_top_loop: - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 + + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 + + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -307,6 +281,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v0.4S, v16.4S, v29.4S mla v1.4S, v17.4S, v29.4S + str q0, [src, #0*64] + str q1, [src, #1*64] + cmge v18.4S, v31.4S, v2.4S cmge v19.4S, v31.4S, v3.4S cmge v16.4S, v2.4S, v30.4S @@ -318,6 +295,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v2.4S, v16.4S, v29.4S mla v3.4S, v17.4S, v29.4S + str q2, [src, #2*64] + str q3, [src, #3*64] + cmge v18.4S, v31.4S, v4.4S cmge v19.4S, v31.4S, v5.4S cmge v16.4S, v4.4S, v30.4S @@ -329,6 +309,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v4.4S, v16.4S, v29.4S mla v5.4S, v17.4S, v29.4S + str q4, [src, #4*64] + str q5, [src, #5*64] + cmge v18.4S, v31.4S, v6.4S cmge v19.4S, v31.4S, v7.4S cmge v16.4S, v6.4S, v30.4S @@ -340,6 +323,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v6.4S, v16.4S, v29.4S mla v7.4S, v17.4S, v29.4S + str q6, [src, #6*64] + str q7, [src, #7*64] + cmge v18.4S, v31.4S, v8.4S cmge v19.4S, v31.4S, v9.4S cmge v16.4S, v8.4S, v30.4S @@ -351,6 +337,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v8.4S, v16.4S, v29.4S mla v9.4S, v17.4S, v29.4S + str q8, [src, #8*64] + str q9, [src, #9*64] + cmge v18.4S, v31.4S, v10.4S cmge v19.4S, v31.4S, v11.4S cmge v16.4S, v10.4S, v30.4S @@ -362,6 +351,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v10.4S, v16.4S, v29.4S mla v11.4S, v17.4S, v29.4S + str q10, [src, #10*64] + str q11, [src, #11*64] + cmge v18.4S, v31.4S, v12.4S cmge v19.4S, v31.4S, v13.4S cmge v16.4S, v12.4S, v30.4S @@ -373,6 +365,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v12.4S, v16.4S, v29.4S mla v13.4S, v17.4S, v29.4S + str q12, [src, #12*64] + str q13, [src, #13*64] + cmge v18.4S, v31.4S, v14.4S cmge v19.4S, v31.4S, v15.4S cmge v16.4S, v14.4S, v30.4S @@ -384,26 +379,11 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - sub counter, counter, #1 - cbnz counter, _intt_top_loop + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 - st1 { v0.4S}, [ src0], #16 - st1 { v1.4S}, [ src1], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v7.4S}, [ src7], #16 - - st1 { v8.4S}, [ src8], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v10.4S}, [src10], #16 - st1 {v11.4S}, [src11], #16 - st1 {v12.4S}, [src12], #16 - st1 {v13.4S}, [src13], #16 - st1 {v14.4S}, [src14], #16 - st1 {v15.4S}, [src15], #16 .unreq Q .unreq Qhalf @@ -412,41 +392,23 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: .unreq invNR2dp .unreq invNWR2ph .unreq invNWR2dp - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot -PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot: -_PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot +PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot: +_PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot: push_all Q .req w20 RphRdp .req x21 src0 .req x0 - des0 .req x1 src1 .req x2 - des1 .req x3 table0 .req x28 table1 .req x27 counter .req x19 @@ -457,72 +419,175 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot: add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + add src1, src0, #512 - add des0, src0, #0 - add des1, src0, #512 + ldr q8, [table0, #4*16] + ldr q9, [table0, #5*16] + ldr q10, [table0, #6*16] + ldr q11, [table0, #7*16] - mov counter, #8 - _intt_bot_loop: + ldr q24, [table1, #4*16] + ldr q25, [table1, #5*16] + ldr q26, [table1, #6*16] + ldr q27, [table1, #7*16] + + ldr q0, [src0, # 0*16] + ldr q1, [src0, # 1*16] + + ldr q16, [src1, # 0*16] + ldr q17, [src1, # 1*16] - ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld4 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + ldr q2, [src0, # 2*16] + ldr q3, [src0, # 3*16] - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + ldr q18, [src1, # 2*16] + ldr q19, [src1, # 3*16] + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 mov v4.S[0], Q mov v20.D[0], RphRdp dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 - dq_butterfly_vec_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 - dq_butterfly_vec_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 - dq_butterfly_vec_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_top v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + mov counter, #7 + _intt_bot_loop: + + dq_butterfly_vec_top_ltrn_4x4 \ + v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + table0, \ + q8, q9, q10, q11, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16), \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q24, q25, q26, q27, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16) dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 - dq_butterfly_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + str q2, [src0, # 2*16] srshr v14.4S, v0.4S, #23 + ldr q2, [src0, #(64+ 2*16)] + str q3, [src0, # 3*16] srshr v15.4S, v1.4S, #23 + ldr q3, [src0, #(64+ 3*16)] + str q18, [src1, # 2*16] srshr v30.4S, v16.4S, #23 + ldr q18, [src1, #(64+ 2*16)] + str q19, [src1, # 3*16] srshr v31.4S, v17.4S, #23 + ldr q19, [src1, #(64+ 3*16)] mls v0.4S, v14.4S, v4.S[0] + str q0, [src0, # 0*16] + ldr q0, [src0, #(64+ 0*16)] mls v1.4S, v15.4S, v4.S[0] + str q1, [src0, # 1*16] + ldr q1, [src0, #(64+ 1*16)] mls v16.4S, v30.4S, v4.S[0] + str q16, [src1, # 0*16] + ldr q16, [src1, #(64+ 0*16)] mls v17.4S, v31.4S, v4.S[0] + str q17, [src1, # 1*16] + ldr q17, [src1, #(64+ 1*16)] + + add table0, table0, #128 + add table1, table1, #128 - st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st1 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 + + mov v4.S[0], Q + mov v20.D[0], RphRdp + + dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 sub counter, counter, #1 cbnz counter, _intt_bot_loop + dq_butterfly_vec_top_trn_4x4 \ + v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + + str q2, [src0, # 2*16] + str q3, [src0, # 3*16] + str q18, [src1, # 2*16] + str q19, [src1, # 3*16] + + srshr v14.4S, v0.4S, #23 + srshr v15.4S, v1.4S, #23 + srshr v30.4S, v16.4S, #23 + srshr v31.4S, v17.4S, #23 + + mls v0.4S, v14.4S, v4.S[0] + mls v1.4S, v15.4S, v4.S[0] + mls v16.4S, v30.4S, v4.S[0] + mls v17.4S, v31.4S, v4.S[0] + + str q0, [src0, # 0*16] + str q1, [src0, # 1*16] + str q16, [src1, # 0*16] + str q17, [src1, # 1*16] + + add table0, table0, #128 + add table1, table1, #128 + + add src0, src0, #64 + add src1, src1, #64 + .unreq Q .unreq RphRdp .unreq src0 - .unreq des0 .unreq src1 - .unreq des1 .unreq table0 .unreq table1 .unreq counter pop_all - br lr - - + ret diff --git a/crypto_sign/dilithium3/aarch64/__asm_poly.S b/crypto_sign/dilithium3/aarch64/__asm_poly.S index 25b73945..a8c9568f 100644 --- a/crypto_sign/dilithium3/aarch64/__asm_poly.S +++ b/crypto_sign/dilithium3/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,10 +32,10 @@ #include "params.h" .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32 -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32 -PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32: -_PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32 +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32 +PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32: +_PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32: mov x7, #16 _10_to_32_loop: @@ -45,7 +48,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32: str w4, [x0], #4 ubfx w5, w2, #20, #10 str w5, [x0], #4 - lsr w6, w2, #30 + lsr w6, w2, #30 ldr w2, [x1], #4 @@ -99,13 +102,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32: sub x7, x7, #1 cbnz x7, _10_to_32_loop - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce: ldr w4, [x1] @@ -117,7 +120,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: ld1 { v1.4S}, [x1], #16 ld1 { v2.4S}, [x1], #16 ld1 { v3.4S}, [x1], #16 - + ld1 { v4.4S}, [x1], #16 srshr v16.4S, v0.4S, #23 ld1 { v5.4S}, [x1], #16 @@ -126,7 +129,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -135,7 +138,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -192,13 +195,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq: ldr w4, [x1] @@ -285,13 +288,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze: ldr w4, [x1] @@ -312,7 +315,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -321,7 +324,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S sshr v16.4S, v0.4S, #31 mls v5.4S, v21.4S, v24.4S @@ -330,7 +333,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: sshr v18.4S, v2.4S, #31 mls v7.4S, v23.4S, v24.4S sshr v19.4S, v3.4S, #31 - + sshr v20.4S, v4.4S, #31 mls v0.4S, v16.4S, v24.4S sshr v21.4S, v5.4S, #31 @@ -339,7 +342,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S sshr v23.4S, v7.4S, #31 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -414,13 +417,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round: mov w4, #1 @@ -560,13 +563,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round: st1 {v30.4S}, [x1], #16 st1 {v31.4S}, [x1], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -609,13 +612,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -632,7 +635,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub: mov x16, #15 _poly_sub_loop: - + st1 {v16.4S}, [x0], #16 ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -658,13 +661,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl: add x1, x0, #0 @@ -725,13 +728,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl: st1 {v22.4S}, [x0], #16 st1 {v23.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery: push_all @@ -769,14 +772,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -819,14 +822,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -843,14 +846,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery -PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: -_PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery +PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery: +_PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery: push_all @@ -910,90 +913,90 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1027,14 +1030,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v27.4S, v23.4S, v31.4S ld1 { v7.4S}, [x2], #16 - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1064,90 +1067,90 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1173,14 +1176,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1194,7 +1197,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: pop_all - br lr + ret diff --git a/crypto_sign/dilithium3/aarch64/api.h b/crypto_sign/dilithium3/aarch64/api.h index 635fe1fb..6f11665e 100644 --- a/crypto_sign/dilithium3/aarch64/api.h +++ b/crypto_sign/dilithium3/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef PQCLEAN_DILITHIUM3_AARCH64_API_H -#define PQCLEAN_DILITHIUM3_AARCH64_API_H +#ifndef API_H +#define API_H /* * This file is dual licensed @@ -12,8 +12,8 @@ #define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES 1952 #define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES 4032 -#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES 3309 -#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_ALGNAME "Dilithium3" +#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES 3309 +#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_ALGNAME "Dilithium3" int PQCLEAN_DILITHIUM3_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); diff --git a/crypto_sign/dilithium3/aarch64/feat.S b/crypto_sign/dilithium3/aarch64/feat.S index 358adf61..f467fa80 100644 --- a/crypto_sign/dilithium3/aarch64/feat.S +++ b/crypto_sign/dilithium3/aarch64/feat.S @@ -123,10 +123,8 @@ SOFTWARE. .endm .align 4 -.global PQCLEAN_DILITHIUM3_AARCH64_f1600x2 -.global _PQCLEAN_DILITHIUM3_AARCH64_f1600x2 -PQCLEAN_DILITHIUM3_AARCH64_f1600x2: -_PQCLEAN_DILITHIUM3_AARCH64_f1600x2: +.global _f1600x2 +_f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! diff --git a/crypto_sign/dilithium3/aarch64/fips202x2.c b/crypto_sign/dilithium3/aarch64/fips202x2.c index 6e3e5d45..e045ee3d 100644 --- a/crypto_sign/dilithium3/aarch64/fips202x2.c +++ b/crypto_sign/dilithium3/aarch64/fips202x2.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -36,6 +37,11 @@ #include #include "fips202x2.h" +#ifdef PROFILE_HASHING +#include "hal.h" +extern unsigned long long hash_cycles; +#endif + #define NROUNDS 24 // Define NEON operation @@ -47,20 +53,20 @@ #define vxor(c, a, b) c = veorq_u64(a, b); // Rotate by n bit ((a << offset) ^ (a >> (64-offset))) #define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); + out = vshlq_n_u64(a, offset); \ + out = vsriq_n_u64(out, a, 64 - offset); // Xor chain: out = a ^ b ^ c ^ d ^ e #define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); // Not And c = ~a & b // #define vbic(c, a, b) c = vbicq_u64(b, a); // Xor Not And: out = a ^ ( (~b) & c) #define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); // Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support #define vrxor(c, a, b) c = vrax1q_u64(a, b); // End Define @@ -100,11 +106,11 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { * * Arguments: - uint64_t *state: pointer to input/output Keccak state **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_f1600x2(v128 *, const uint64_t *); +extern void f1600x2(v128 *, const uint64_t *); static inline void KeccakF1600_StatePermutex2(v128 state[25]) { #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_DILITHIUM3_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); + f1600x2(state, neon_KeccakF_RoundConstants); #else v128 Aba, Abe, Abi, Abo, Abu; v128 Aga, Age, Agi, Ago, Agu; @@ -551,7 +557,14 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -570,7 +583,14 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -587,7 +607,14 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -606,7 +633,14 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -625,6 +659,9 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -644,6 +681,10 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -662,6 +703,9 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -681,4 +725,8 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } diff --git a/crypto_sign/dilithium3/aarch64/fips202x2.h b/crypto_sign/dilithium3/aarch64/fips202x2.h index 28babbc3..3066c52b 100644 --- a/crypto_sign/dilithium3/aarch64/fips202x2.h +++ b/crypto_sign/dilithium3/aarch64/fips202x2.h @@ -8,9 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" -#include #include +#include typedef uint64x2_t v128; @@ -23,31 +22,26 @@ typedef struct { v128 s[25]; } keccakx2_state; -#define shake128x2_absorb DILITHIUM_NAMESPACE(shake128x2_absorb) void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake128x2_squeezeblocks DILITHIUM_NAMESPACE(shake128x2_squeezeblocks) void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake256x2_absorb DILITHIUM_NAMESPACE(shake256x2_absorb) void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake256x2_squeezeblocks DILITHIUM_NAMESPACE(shake256x2_squeezeblocks) void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake128x2 DILITHIUM_NAMESPACE(shake128x2) void shake128x2(uint8_t *out0, uint8_t *out1, size_t outlen, @@ -55,11 +49,11 @@ void shake128x2(uint8_t *out0, const uint8_t *in1, size_t inlen); -#define shake256x2 DILITHIUM_NAMESPACE(shake256x2) void shake256x2(uint8_t *out0, uint8_t *out1, size_t outlen, const uint8_t *in0, const uint8_t *in1, size_t inlen); + #endif diff --git a/crypto_sign/dilithium3/aarch64/macros.inc b/crypto_sign/dilithium3/aarch64/macros.inc index ef3af4c5..5504405c 100644 --- a/crypto_sign/dilithium3/aarch64/macros.inc +++ b/crypto_sign/dilithium3/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,24 +30,254 @@ #include "macros_common.inc" -.macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - trn1 \t0\qS, \a0\qS, \a1\qS - trn2 \t1\qS, \a0\qS, \a1\qS - trn1 \t2\qS, \a2\qS, \a3\qS - trn2 \t3\qS, \a2\qS, \a3\qS + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S - trn1 \a0\dD, \t0\dD, \t2\dD - trn2 \a2\dD, \t0\dD, \t2\dD - trn1 \a1\dD, \t1\dD, \t3\dD - trn2 \a3\dD, \t1\dD, \t3\dD + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D .endm -.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - wrap_trn_4x4 \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, .4S, .2D + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + +.macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + +.macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 .endm +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + + +.macro do_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== .macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S @@ -54,12 +287,20 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 .endm -.macro dq_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S .endm @@ -67,16 +308,32 @@ wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + .macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 .endm -.macro dq_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm @@ -89,16 +346,48 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + .macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm -.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 .endm -.macro qq_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S .endm @@ -109,3 +398,5 @@ .macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S .endm + +// === 32-bit end ==== diff --git a/crypto_sign/dilithium3/aarch64/macros_common.inc b/crypto_sign/dilithium3/aarch64/macros_common.inc index bd7e77eb..07568491 100644 --- a/crypto_sign/dilithium3/aarch64/macros_common.inc +++ b/crypto_sign/dilithium3/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,35 +28,58 @@ * SOFTWARE. */ +#ifndef MACROS_COMMON +#define MACROS_COMMON + // for ABI .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -72,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -82,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -99,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -135,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -149,7 +310,134 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t4\wX, \b4\wX, \z4\nX[\h4] @@ -176,7 +464,186 @@ .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] + mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] + mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] + mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -218,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -228,15 +769,82 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] sub \b1\wX, \a1\wX, \t1\wX mul \t3\wX, \b3\wX, \h3\wX + ldr \c2, [\srcc_ptr, \memc2] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b3\wX, \b3\wX, \l3\wX @@ -245,7 +853,7 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX @@ -262,57 +870,98 @@ .endm +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + // vector-scalar Barrett reduction .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -391,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX @@ -448,3 +1191,10 @@ add \s3\wX, \a3\wX, \b3\wX .endm + + +#endif + + + + diff --git a/crypto_sign/dilithium3/aarch64/ntt.c b/crypto_sign/dilithium3/aarch64/ntt.c index 2d88c5d5..92d92313 100644 --- a/crypto_sign/dilithium3/aarch64/ntt.c +++ b/crypto_sign/dilithium3/aarch64/ntt.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,13 +33,28 @@ */ #include "params.h" -#include "reduce.h" #include #include #include "NTT_params.h" #include "ntt.h" +const __attribute__ ((aligned (16)))int32_t constants[16] = { + Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, + invNQ1R2modQ1_prime_half, + invNQ1R2modQ1_doubleprime, + invNQ1_final_R2modQ1_prime_half, + invNQ1_final_R2modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { +0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { +0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 +}; + /************************************************* * Name: ntt * diff --git a/crypto_sign/dilithium3/aarch64/ntt.h b/crypto_sign/dilithium3/aarch64/ntt.h index 0fdd0040..ef8dd217 100644 --- a/crypto_sign/dilithium3/aarch64/ntt.h +++ b/crypto_sign/dilithium3/aarch64/ntt.h @@ -6,12 +6,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,45 +34,39 @@ * SOFTWARE. */ -#include "NTT_params.h" -#include "params.h" #include +#include "params.h" +#include "NTT_params.h" + +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants); +extern +const int32_t constants[16]; -#define NTT(in) { \ - PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define iNTT(in) { \ - PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; + +#define NTT(in) do { \ + PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + } while(0) #define ntt DILITHIUM_NAMESPACE(ntt) void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); -static const int constants[16] = { - Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, - invNQ1R2modQ1_prime_half, - invNQ1R2modQ1_doubleprime, - invNQ1_final_R2modQ1_prime_half, - invNQ1_final_R2modQ1_doubleprime -}; - -static const int streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 0, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 3073009, 327391679, 1277625, -675340520, -2635473, 987079667, 3852015, 449207, 1753, -495951789, -1935420, -681503850, -2659525, -373072124, -1455890, 681730119, 2660408, -456183549, -1780227, -15156688, -59148, 710479343, 2772600, 0, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, 4183372, -825844983, -3222807, -799869667, -3121440, -70227934, -274060, 302950022, 1182243, 22347069, 87208, 163212680, 636927, -1016110510, -3965306, -1013916752, -3956745, -588452222, -2296397, -841760171, -3284915, -952468207, -3716946, 0, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 2508980, 519705671, 2028118, 496502727, 1937570, -977780347, -3815725, -7126831, -27812, 210776307, 822541, 258649997, 1009365, -628875181, -2454145, -507246529, -1979497, 409185979, 1596822, -1013967746, -3956944, -963363710, -3759465, 0, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, 2811291, -764594519, -2983781, -284313712, -1109516, 1065510939, 4158088, -431820817, -1685153, -873958779, -3410568, 686309310, 2678278, -965793731, -3768948, -909946047, -3551006, 162963861, 635956, -64176841, -250446, -629190881, -2455377, 0, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 1528066, 123678909, 482649, 294395108, 1148858, -759080783, -2962264, -1062481036, -4146264, -454226054, -1772588, 561940831, 2192938, -442566669, -1727088, 611800717, 2387513, -925511710, -3611750, -68791907, -268456, -814992530, -3180456, 0, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, -565603, 43482586, 169688, 631001801, 2462444, -854436357, -3334383, 960233614, 3747250, 588375860, 2296099, 317727459, 1239911, -983611064, -3838479, 818892658, 3195676, 677264190, 2642980, 321386456, 1254190, -3181859, -12417, 0, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -4166425, -893898890, -3488383, 509377762, 1987814, -819295484, -3197248, 768294260, 2998219, 36345249, 141835, -22883400, -89301, 643961400, 2513018, -347191365, -1354892, 157142369, 613238, -335754661, -1310261, -568482643, -2218467, 0, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 1736313, 60323094, 235407, -832852657, -3250154, 834980303, 3258457, -117552223, -458740, -492511373, -1921994, 1035301089, 4040196, -889718424, -3472069, 522531086, 2039144, -481719139, -1879878, -209807681, -818761, -558360247, -2178965, 0, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, -2579253, 458160776, 1787943, -612717067, -2391089, -577774276, -2254727, -415984810, -1623354, 539479988, 2105286, -608441020, -2374402, -521163479, -2033807, 150224382, 586241, -302276083, -1179613, 135295244, 527981, -702999655, -2743411, 0, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, 3482206, -1071872863, -4182915, -333129378, -1300016, -605279149, -2362063, -378477722, -1476985, 510974714, 1994046, 638402564, 2491325, -356997292, -1393159, 130156402, 507927, -304395785, -1187885, -185731180, -724804, -470097680, -1834526, 0, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, -1317678, 630730945, 2461387, 777970524, 3035980, 159173408, 621164, -777397036, -3033742, -86720197, -338420, 678549029, 2647994, 771248568, 3009748, -669544140, -2612853, 1063046068, 4148469, 192079267, 749577, -1030830548, -4022750, 0, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, 3901472, -314332144, -1226661, 749740976, 2925816, 864652284, 3374250, 1020029345, 3980599, 658309618, 2569011, -413979908, -1615530, 441577800, 1723229, 426738094, 1665318, 519685171, 2028038, 298172236, 1163598, -863376927, -3369273, 0, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, 1356448, -711287812, -2775755, 687588511, 2683270, -712065019, -2778788, 1023635298, 3994671, -3043996, -11879, -351195274, -1370517, 773976352, 3020393, 861908357, 3363542, 55063046, 214880, 139752717, 545376, -197425671, -770441, 0, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, -3467665, 592665232, 2312838, -167401858, -653275, -117660617, -459163, 795799901, 3105558, -282732136, -1103344, 130212265, 508145, -141890356, -553718, 220412084, 860144, 879049958, 3430436, 35937555, 140244, -388001774, -1514152, 0, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, 348812, -84011120, -327848, 259126110, 1011223, -603268097, -2354215, -559928242, -2185084, 800464680, 3123762, 604333585, 2358373, -561979013, -2193087, -772445769, -3014420, -439933955, -1716814, 749801963, 2926054, -100631253, -392707, 0, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -3818627, -492577742, -1922253, -573161516, -2236726, 447030292, 1744507, -77645096, -303005, 904878186, 3531229, -1018462631, -3974485, -967019376, -3773731, 486888731, 1900052, -200355636, -781875, 270210213, 1054478, -187430119, -731434, 0, 0 -}; - -static const int streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 0, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, -1744507, 573161516, 2236726, 492577742, 1922253, 978523985, 3818627, 187430119, 731434, -270210213, -1054478, 200355636, 781875, -486888731, -1900052, 967019376, 3773731, 1018462631, 3974485, -904878186, -3531229, 77645096, 303005, 0, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, 2354215, -259126110, -1011223, 84011120, 327848, -89383150, -348812, 100631253, 392707, -749801963, -2926054, 439933955, 1716814, 772445769, 3014420, 561979013, 2193087, -604333585, -2358373, -800464680, -3123762, 559928242, 2185084, 0, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 459163, 167401858, 653275, -592665232, -2312838, 888589898, 3467665, 388001774, 1514152, -35937555, -140244, -879049958, -3430436, -220412084, -860144, 141890356, 553718, -130212265, -508145, 282732136, 1103344, -795799901, -3105558, 0, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, 2778788, -687588511, -2683270, 711287812, 2775755, -347590090, -1356448, 197425671, 770441, -139752717, -545376, -55063046, -214880, -861908357, -3363542, -773976352, -3020393, 351195274, 1370517, 3043996, 11879, -1023635298, -3994671, 0, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -3374250, -749740976, -2925816, 314332144, 1226661, -999753034, -3901472, 863376927, 3369273, -298172236, -1163598, -519685171, -2028038, -426738094, -1665318, -441577800, -1723229, 413979908, 1615530, -658309618, -2569011, -1020029345, -3980599, 0, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -621164, -777970524, -3035980, -630730945, -2461387, 337655269, 1317678, 1030830548, 4022750, -192079267, -749577, -1063046068, -4148469, 669544140, 2612853, -771248568, -3009748, -678549029, -2647994, 86720197, 338420, 777397036, 3033742, 0, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 2362063, 333129378, 1300016, 1071872863, 4182915, -892316032, -3482206, 470097680, 1834526, 185731180, 724804, 304395785, 1187885, -130156402, -507927, 356997292, 1393159, -638402564, -2491325, -510974714, -1994046, 378477722, 1476985, 0, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 2254727, 612717067, 2391089, -458160776, -1787943, 660934133, 2579253, 702999655, 2743411, -135295244, -527981, 302276083, 1179613, -150224382, -586241, 521163479, 2033807, 608441020, 2374402, -539479988, -2105286, 415984810, 1623354, 0, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, -3258457, 832852657, 3250154, -60323094, -235407, -444930577, -1736313, 558360247, 2178965, 209807681, 818761, 481719139, 1879878, -522531086, -2039144, 889718424, 3472069, -1035301089, -4040196, 492511373, 1921994, 117552223, 458740, 0, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, 3197248, -509377762, -1987814, 893898890, 3488383, 1067647297, 4166425, 568482643, 2218467, 335754661, 1310261, -157142369, -613238, 347191365, 1354892, -643961400, -2513018, 22883400, 89301, -36345249, -141835, -768294260, -2998219, 0, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, 3334383, -631001801, -2462444, -43482586, -169688, 144935890, 565603, 3181859, 12417, -321386456, -1254190, -677264190, -2642980, -818892658, -3195676, 983611064, 3838479, -317727459, -1239911, -588375860, -2296099, -960233614, -3747250, 0, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, 2962264, -294395108, -1148858, -123678909, -482649, -391567239, -1528066, 814992530, 3180456, 68791907, 268456, 925511710, 3611750, -611800717, -2387513, 442566669, 1727088, -561940831, -2192938, 454226054, 1772588, 1062481036, 4146264, 0, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, -4158088, 284313712, 1109516, 764594519, 2983781, -720393920, -2811291, 629190881, 2455377, 64176841, 250446, -162963861, -635956, 909946047, 3551006, 965793731, 3768948, -686309310, -2678278, 873958779, 3410568, 431820817, 1685153, 0, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, 3815725, -496502727, -1937570, -519705671, -2028118, -642926661, -2508980, 963363710, 3759465, 1013967746, 3956944, -409185979, -1596822, 507246529, 1979497, 628875181, 2454145, -258649997, -1009365, -210776307, -822541, 7126831, 27812, 0, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 274060, 799869667, 3121440, 825844983, 3222807, -1071989969, -4183372, 952468207, 3716946, 841760171, 3284915, 588452222, 2296397, 1013916752, 3956745, 1016110510, 3965306, -163212680, -636927, -22347069, -87208, -302950022, -1182243, 0, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, -3852015, 675340520, 2635473, -327391679, -1277625, -787459213, -3073009, -710479343, -2772600, 15156688, 59148, 456183549, 1780227, -681730119, -2660408, 373072124, 1455890, 681503850, 2659525, 495951789, 1935420, -449207, -1753, 0, 0 -}; #endif diff --git a/crypto_sign/dilithium3/aarch64/packing.c b/crypto_sign/dilithium3/aarch64/packing.c index 8fa3b0cc..779976ec 100644 --- a/crypto_sign/dilithium3/aarch64/packing.c +++ b/crypto_sign/dilithium3/aarch64/packing.c @@ -19,7 +19,7 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1) { unsigned int i; @@ -45,7 +45,7 @@ void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], **************************************************/ void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) { + const uint8_t pk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -71,7 +71,7 @@ void unpack_pk(uint8_t rho[SEEDBYTES], * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 **************************************************/ -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -129,7 +129,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]) { + const uint8_t sk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -172,7 +172,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h **************************************************/ -void pack_sig(uint8_t sig[CRYPTO_BYTES], +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h) { @@ -221,7 +221,7 @@ void pack_sig(uint8_t sig[CRYPTO_BYTES], int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, - const uint8_t sig[CRYPTO_BYTES]) { + const uint8_t sig[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES]) { unsigned int i, j, k; for (i = 0; i < CTILDEBYTES; ++i) { diff --git a/crypto_sign/dilithium3/aarch64/packing.h b/crypto_sign/dilithium3/aarch64/packing.h index fb70ce5d..c1fba82a 100644 --- a/crypto_sign/dilithium3/aarch64/packing.h +++ b/crypto_sign/dilithium3/aarch64/packing.h @@ -7,15 +7,16 @@ * or public domain at https://github.com/pq-crystals/dilithium */ +#include "api.h" #include "params.h" #include "polyvec.h" #include #define pack_pk DILITHIUM_NAMESPACE(pack_pk) -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); #define pack_sk DILITHIUM_NAMESPACE(pack_sk) -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -24,10 +25,10 @@ void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], const polyveck *s2); #define pack_sig DILITHIUM_NAMESPACE(pack_sig) -void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); #define unpack_pk DILITHIUM_NAMESPACE(unpack_pk) -void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]); +void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES]); #define unpack_sk DILITHIUM_NAMESPACE(unpack_sk) void unpack_sk(uint8_t rho[SEEDBYTES], @@ -36,9 +37,9 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + const uint8_t sk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES]); #define unpack_sig DILITHIUM_NAMESPACE(unpack_sig) -int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); +int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES]); #endif diff --git a/crypto_sign/dilithium3/aarch64/params.h b/crypto_sign/dilithium3/aarch64/params.h index 922c44d0..fe19f815 100644 --- a/crypto_sign/dilithium3/aarch64/params.h +++ b/crypto_sign/dilithium3/aarch64/params.h @@ -11,8 +11,8 @@ #define DILITHIUM_MODE 3 //#define DILITHIUM_MODE 5 -#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM3_AARCH64_crypto_sign #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM3_AARCH64_##s +#define CRYPTO_NAMESPACETOP crypto_sign #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s) @@ -25,6 +25,20 @@ #define D 13 #define ROOT_OF_UNITY 1753 +#if DILITHIUM_MODE == 2 + +#define K 4 +#define L 4 +#define ETA 2 +#define TAU 39 +#define BETA 78 +#define GAMMA1 (1 << 17) +#define GAMMA2 ((DILITHIUM_Q-1)/88) +#define OMEGA 80 +#define CRYPTO_ALGNAME "Dilithium2" +#define CTILDEBYTES 32 +#elif DILITHIUM_MODE == 3 + #define K 6 #define L 5 #define ETA 4 @@ -35,23 +49,54 @@ #define OMEGA 55 #define CRYPTO_ALGNAME "Dilithium3" #define CTILDEBYTES 48 +#elif DILITHIUM_MODE == 5 + +#define K 8 +#define L 7 +#define ETA 2 +#define TAU 60 +#define BETA 120 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((DILITHIUM_Q-1)/32) +#define OMEGA 75 +#define CRYPTO_ALGNAME "Dilithium5" +#define CTILDEBYTES 64 +#else + +#error "No parameter specified!" + +#endif + + #define POLYT1_PACKEDBYTES 320 #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) +#if GAMMA1 == (1 << 17) +#define POLYZ_PACKEDBYTES 576 +#elif GAMMA1 == (1 << 19) #define POLYZ_PACKEDBYTES 640 +#endif +#if GAMMA2 == (DILITHIUM_Q-1)/88 +#define POLYW1_PACKEDBYTES 192 +#elif GAMMA2 == (DILITHIUM_Q-1)/32 #define POLYW1_PACKEDBYTES 128 +#endif +#if ETA == 2 +#define POLYETA_PACKEDBYTES 96 +#elif ETA == 4 #define POLYETA_PACKEDBYTES 128 +#endif -#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) -#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ - + TRBYTES \ - + L*POLYETA_PACKEDBYTES \ - + K*POLYETA_PACKEDBYTES \ - + K*POLYT0_PACKEDBYTES) -#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) +#define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ + + TRBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define DILITHIUM_CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/crypto_sign/dilithium3/aarch64/poly.c b/crypto_sign/dilithium3/aarch64/poly.c index 1832b641..687e14f9 100644 --- a/crypto_sign/dilithium3/aarch64/poly.c +++ b/crypto_sign/dilithium3/aarch64/poly.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,13 +41,8 @@ #include "fips202x2.h" -#include "NTT_params.h" #include "ntt.h" -static const int32_t montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; - #define DBENCH_START() #define DBENCH_STOP(t) @@ -57,11 +54,11 @@ static const int32_t montgomery_const[4] = { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce(int32_t *, const int32_t *); void poly_reduce(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -74,11 +71,11 @@ void poly_reduce(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq(int32_t *, const int32_t *); void poly_caddq(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -91,11 +88,11 @@ void poly_caddq(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze(int32_t *, const int32_t *); void poly_freeze(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -205,11 +202,11 @@ void poly_invntt_tomont(poly *a) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, constants); DBENCH_STOP(*tmul); } @@ -226,11 +223,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { * - poly *a0: pointer to output polynomial with coefficients c0 * - const poly *a: pointer to input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *); void poly_power2round(poly *a1, poly *a0, const poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); DBENCH_STOP(*tround); } @@ -470,6 +467,19 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; + #if ETA == 2 + + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; + a[ctr++] = 2 - t0; + } + if (t1 < 15 && ctr < len) { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - t1; + } + + #elif ETA == 4 + if (t0 < 9) { a[ctr++] = 4 - t0; } @@ -477,6 +487,12 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 4 - t1; } + #else + +#error "No parameter specified!" + + #endif + } DBENCH_STOP(*tsample); @@ -494,7 +510,11 @@ static unsigned int rej_eta(int32_t *a, * - const uint8_t seed[]: byte array with seed of length CRHBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ +#if ETA == 2 +#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +#elif ETA == 4 #define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +#endif void poly_uniform_eta(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { @@ -641,12 +661,37 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); + #if ETA == 2 + + for (i = 0; i < N / 8; ++i) { + t[0] = ETA - a->coeffs[8 * i + 0]; + t[1] = ETA - a->coeffs[8 * i + 1]; + t[2] = ETA - a->coeffs[8 * i + 2]; + t[3] = ETA - a->coeffs[8 * i + 3]; + t[4] = ETA - a->coeffs[8 * i + 4]; + t[5] = ETA - a->coeffs[8 * i + 5]; + t[6] = ETA - a->coeffs[8 * i + 6]; + t[7] = ETA - a->coeffs[8 * i + 7]; + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + } + + #elif ETA == 4 + for (i = 0; i < N / 2; ++i) { t[0] = ETA - a->coeffs[2 * i + 0]; t[1] = ETA - a->coeffs[2 * i + 1]; r[i] = t[0] | (t[1] << 4); } + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -662,6 +707,30 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); + #if ETA == 2 + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; + } + + #elif ETA == 4 + for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[i] & 0x0F; r->coeffs[2 * i + 1] = a[i] >> 4; @@ -669,6 +738,12 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; } + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -706,11 +781,11 @@ void polyt1_pack(uint8_t *r, const poly *a) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32(int32_t *, const uint8_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32(int32_t *, const uint8_t *); void polyt1_unpack(poly *r, const uint8_t *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32(r->coeffs, a); + PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32(r->coeffs, a); DBENCH_STOP(*tpack); } @@ -841,6 +916,30 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); + #if GAMMA1 == (1 << 17) + + for (i = 0; i < N / 4; ++i) { + t[0] = GAMMA1 - a->coeffs[4 * i + 0]; + t[1] = GAMMA1 - a->coeffs[4 * i + 1]; + t[2] = GAMMA1 - a->coeffs[4 * i + 2]; + t[3] = GAMMA1 - a->coeffs[4 * i + 3]; + + r[9 * i + 0] = t[0]; + r[9 * i + 1] = t[0] >> 8; + r[9 * i + 2] = t[0] >> 16; + r[9 * i + 2] |= t[1] << 2; + r[9 * i + 3] = t[1] >> 6; + r[9 * i + 4] = t[1] >> 14; + r[9 * i + 4] |= t[2] << 4; + r[9 * i + 5] = t[2] >> 4; + r[9 * i + 6] = t[2] >> 12; + r[9 * i + 6] |= t[3] << 6; + r[9 * i + 7] = t[3] >> 2; + r[9 * i + 8] = t[3] >> 10; + } + + #elif GAMMA1 == (1 << 19) + for (i = 0; i < N / 2; ++i) { t[0] = GAMMA1 - a->coeffs[2 * i + 0]; t[1] = GAMMA1 - a->coeffs[2 * i + 1]; @@ -853,6 +952,12 @@ void polyz_pack(uint8_t *r, const poly *a) { r[5 * i + 4] = t[1] >> 12; } + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -869,6 +974,37 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); + #if GAMMA1 == (1 << 17) + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = a[9 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16; + r->coeffs[4 * i + 0] &= 0x3FFFF; + + r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14; + r->coeffs[4 * i + 1] &= 0x3FFFF; + + r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12; + r->coeffs[4 * i + 2] &= 0x3FFFF; + + r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10; + r->coeffs[4 * i + 3] &= 0x3FFFF; + + r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0]; + r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1]; + r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2]; + r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; + } + + #elif GAMMA1 == (1 << 19) + for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; @@ -884,6 +1020,12 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; } + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -901,9 +1043,28 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); + #if GAMMA2 == (DILITHIUM_Q-1)/88 + + for (i = 0; i < N / 4; ++i) { + r[3 * i + 0] = a->coeffs[4 * i + 0]; + r[3 * i + 0] |= a->coeffs[4 * i + 1] << 6; + r[3 * i + 1] = a->coeffs[4 * i + 1] >> 2; + r[3 * i + 1] |= a->coeffs[4 * i + 2] << 4; + r[3 * i + 2] = a->coeffs[4 * i + 2] >> 4; + r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2; + } + + #elif GAMMA2 == (DILITHIUM_Q-1)/32 + for (i = 0; i < N / 2; ++i) { r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); } + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium3/aarch64/polyvec.c b/crypto_sign/dilithium3/aarch64/polyvec.c index d8d9d2b9..6cd84c83 100644 --- a/crypto_sign/dilithium3/aarch64/polyvec.c +++ b/crypto_sign/dilithium3/aarch64/polyvec.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,17 +32,14 @@ * SOFTWARE. */ +#include + #include "params.h" #include "poly.h" #include "polyvec.h" -#include - +#include "ntt.h" #include "reduce.h" -static const int32_t l_montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; - /************************************************* * Name: expand_mat * @@ -177,11 +176,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { - PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, constants); } /************************************************* diff --git a/crypto_sign/dilithium3/aarch64/polyvec.h b/crypto_sign/dilithium3/aarch64/polyvec.h index dc3377c9..8844ca79 100644 --- a/crypto_sign/dilithium3/aarch64/polyvec.h +++ b/crypto_sign/dilithium3/aarch64/polyvec.h @@ -42,9 +42,12 @@ void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v); + #define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm) int polyvecl_chknorm(const polyvecl *v, int32_t B); + + /* Vectors of polynomials of length K */ typedef struct { poly vec[K]; diff --git a/crypto_sign/dilithium3/aarch64/rounding.c b/crypto_sign/dilithium3/aarch64/rounding.c index 871c9759..30c97510 100644 --- a/crypto_sign/dilithium3/aarch64/rounding.c +++ b/crypto_sign/dilithium3/aarch64/rounding.c @@ -47,10 +47,22 @@ int32_t decompose(int32_t *a0, int32_t a) { int32_t a1; a1 = (a + 127) >> 7; + #if GAMMA2 == (DILITHIUM_Q-1)/32 a1 = (a1 * 1025 + (1 << 21)) >> 22; a1 &= 15; + #elif GAMMA2 == (DILITHIUM_Q-1)/88 + + a1 = (a1 * 11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; + + #else + +#error "No parameter specified" + + #endif + *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; return a1; @@ -93,9 +105,22 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } + #if GAMMA2 == (DILITHIUM_Q-1)/32 + if (a0 > 0) { return (a1 + 1) & 15; + } else { + return (a1 - 1) & 15; } - return (a1 - 1) & 15; + + #elif GAMMA2 == (DILITHIUM_Q-1)/88 + + if (a0 > 0) { + return (a1 == 43) ? 0 : a1 + 1; + } else { + return (a1 == 0) ? 43 : a1 - 1; + } + + #endif } diff --git a/crypto_sign/dilithium3/aarch64/sign.c b/crypto_sign/dilithium3/aarch64/sign.c index 3565b370..5eb6dee8 100644 --- a/crypto_sign/dilithium3/aarch64/sign.c +++ b/crypto_sign/dilithium3/aarch64/sign.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -90,7 +91,7 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { pack_pk(pk, rho, &t1); /* Compute H(rho, t1) and write secret key */ - shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(tr, TRBYTES, pk, PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES); pack_sk(sk, rho, tr, key, &t0, &s1, &s2); return 0; @@ -139,7 +140,8 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); - for (n = 0; n < RNDBYTES; n++) { + + for(n = 0; n < RNDBYTES; n++) { rnd[n] = 0; } shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); @@ -210,7 +212,7 @@ int crypto_sign_signature(uint8_t *sig, /* Write signature */ pack_sig(sig, sig, &z, &h); - *siglen = CRYPTO_BYTES; + *siglen = PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES; return 0; } @@ -238,9 +240,9 @@ int crypto_sign(uint8_t *sm, size_t i; for (i = 0; i < mlen; ++i) { - sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + sm[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; } - crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); + crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES, mlen, sk); *smlen += mlen; return 0; } @@ -274,7 +276,7 @@ int crypto_sign_verify(const uint8_t *sig, polyveck t1, w1, h; shake256incctx state; - if (siglen != CRYPTO_BYTES) { + if (siglen != PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES) { return -1; } @@ -287,7 +289,7 @@ int crypto_sign_verify(const uint8_t *sig, } /* Compute CRH(H(rho, t1), msg) */ - shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(mu, CRHBYTES, pk, PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES); shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -353,17 +355,17 @@ int crypto_sign_open(uint8_t *m, const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; - if (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES; + if (crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] = sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES + i]; } return 0; } diff --git a/crypto_sign/dilithium3/aarch64/sign.h b/crypto_sign/dilithium3/aarch64/sign.h index bc8c4265..05e7b5f6 100644 --- a/crypto_sign/dilithium3/aarch64/sign.h +++ b/crypto_sign/dilithium3/aarch64/sign.h @@ -13,6 +13,7 @@ #include #include + #define challenge DILITHIUM_NAMESPACE(challenge) void challenge(poly *c, const uint8_t seed[SEEDBYTES]); @@ -24,7 +25,7 @@ int crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); -#define crypto_sign DILITHIUM_NAMESPACETOP +#define crypto_sign DILITHIUM_NAMESPACE(crypto_sign) int crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk); diff --git a/crypto_sign/dilithium3/aarch64/symmetric-shake.c b/crypto_sign/dilithium3/aarch64/symmetric-shake.c index a53074aa..53aab1c9 100644 --- a/crypto_sign/dilithium3/aarch64/symmetric-shake.c +++ b/crypto_sign/dilithium3/aarch64/symmetric-shake.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/crypto_sign/dilithium3/aarch64/symmetric.h b/crypto_sign/dilithium3/aarch64/symmetric.h index 40b928ec..74d21021 100644 --- a/crypto_sign/dilithium3/aarch64/symmetric.h +++ b/crypto_sign/dilithium3/aarch64/symmetric.h @@ -6,8 +6,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -59,6 +60,7 @@ void dilithium_shake256x2_stream_init(keccakx2_state *state, const uint8_t seed[CRHBYTES], uint16_t nonce1, uint16_t nonce2); + #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE diff --git a/crypto_sign/dilithium5/aarch64/LICENSE b/crypto_sign/dilithium5/aarch64/LICENSE index 0e259d42..093b0a7d 100644 --- a/crypto_sign/dilithium5/aarch64/LICENSE +++ b/crypto_sign/dilithium5/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/crypto_sign/dilithium5/aarch64/NTT_params.h b/crypto_sign/dilithium5/aarch64/NTT_params.h index 582c16ed..dc261a2d 100644 --- a/crypto_sign/dilithium5/aarch64/NTT_params.h +++ b/crypto_sign/dilithium5/aarch64/NTT_params.h @@ -2,7 +2,9 @@ #define NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -42,14 +44,14 @@ #define invomegaQ1 731434 // R = 2^32 below // RmodQ1 = 2^32 mod^{+-} Q1 -#define RmodQ1 (-4186625) +#define RmodQ1 -4186625 // Q1prime = Q1^{-1} mod^{+-} 2^32 #define Q1prime 58728449 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 8347681 // invNQ1R2modQ1 = -NTT_N^{-1} 2^32 2^32 mod^{+-} Q1 below -#define invNQ1R2modQ1 (-41978) +#define invNQ1R2modQ1 -41978 // invNQ1R2modQ1_prime = invNQ1R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 #define invNQ1R2modQ1_prime 8395782 // invNQ1R2modQ1_prime_half = (invNQ1R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 @@ -60,11 +62,11 @@ // invNQ1_final_R2modQ1 = -invNQ1R2modQ1 invomegaQ1^{128} mod q #define invNQ1_final_R2modQ1 4404704 // invNQ1_final_R2modQ1_prime = invNQ1_final_R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime (-151046688) +#define invNQ1_final_R2modQ1_prime -151046688 // invNQ1_final_R2modQ1_prime_half = (invNQ1_final_R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime_half (-75523344) +#define invNQ1_final_R2modQ1_prime_half -75523344 // invNQ1_final_R2modQ1_doubleprime = (invNQ1_final_R2modQ1_prime Q1 - invNQ1_final_R2modQ1) / 2^32 -#define invNQ1_final_R2modQ1_doubleprime (-294725) +#define invNQ1_final_R2modQ1_doubleprime -294725 // RmodQ1_prime = -(RmodQ1 + Q1) Q1prime mod^{+-} 2^32 #define RmodQ1_prime 512 diff --git a/crypto_sign/dilithium5/aarch64/__asm_NTT.S b/crypto_sign/dilithium5/aarch64/__asm_NTT.S index c1d25f64..be5a97f5 100644 --- a/crypto_sign/dilithium5/aarch64/__asm_NTT.S +++ b/crypto_sign/dilithium5/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,261 +30,413 @@ #include "macros.inc" -.align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top -PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top: -_PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top: - - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 - - ldr Q, [x2] - - mov table, x1 - - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 - - add src11, src0, #704 - add src12, src0, #768 +#include "params.h" - add src13, src0, #832 - add src14, src0, #896 +.align 2 +.global PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top +PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top: +_PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top: - add src15, src0, #960 + push_simd + Q .req w8 + src .req x0 + counter .req x11 - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 + ldr Q, [x2] mov v20.S[0], Q - ld1 { v1.4S}, [ src1] - ld1 { v3.4S}, [ src3] - ld1 { v5.4S}, [ src5] - ld1 { v7.4S}, [ src7] - ld1 { v9.4S}, [ src9] - ld1 {v11.4S}, [src11] - ld1 {v13.4S}, [src13] - ld1 {v15.4S}, [src15] - - ld1 { v0.4S}, [ src0] - ld1 { v2.4S}, [ src2] - ld1 { v4.4S}, [ src4] - ld1 { v6.4S}, [ src6] - ld1 { v8.4S}, [ src8] - ld1 {v10.4S}, [src10] - ld1 {v12.4S}, [src12] - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + ldr q9, [src, #9*64] + ldr q11, [src, #11*64] + ldr q13, [src, #13*64] + ldr q15, [src, #15*64] + + qq_butterfly_topl \ + v9, v11, v13, v15, v16, v17, v18, v19, v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64 + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) mov counter, #3 _ntt_top_loop: - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mixssl \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + v9, v11, v13, v15, v16, v17, v18, v19, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64, \ + src, \ + q1, q3, q5, q7, \ + #(16+1*64), #(16+3*64), #(16+5*64), #(16+7*64) + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #(16+8*64), #(16+10*64), #(16+12*64), #(16+14*64), \ + src, \ + q0, q2, q4, q6, \ + #(16+0*64), #(16+2*64), #(16+4*64), #(16+6*64) + + add src, src, #16 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) sub counter, counter, #1 cbnz counter, _ntt_top_loop - st1 { v1.4S}, [ src1], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v7.4S}, [ src7], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v11.4S}, [src11], #16 - st1 {v13.4S}, [src13], #16 - st1 {v15.4S}, [src15], #16 - - st1 { v0.4S}, [ src0], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v8.4S}, [ src8], #16 - st1 {v10.4S}, [src10], #16 - st1 {v12.4S}, [src12], #16 - st1 {v14.4S}, [src14], #16 + qq_butterfly_botss \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter - pop_all + pop_simd - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot -PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot: -_PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot: - - push_all - Q .req w20 - src0 .req x0 - des0 .req x1 - src1 .req x2 - des1 .req x3 - table0 .req x28 - table1 .req x27 - counter .req x19 +.global PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot +PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot: +_PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot: + + push_simd + Q .req w8 + src .req x0 + table0 .req x9 + table1 .req x10 + counter .req x11 ldr Q, [x2] add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + ldr q0, [src, #0*16] + ldr q1, [src, #1*16] + ldr q2, [src, #2*16] + ldr q3, [src, #3*16] + + ldr q4, [table0, #0*16] + ldr q5, [table0, #1*16] + ldr q20, [table1, #0*16] + ldr q21, [table1, #1*16] + + dq_butterfly_topl4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 + + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + - add des0, src0, #0 - add des1, src0, #512 + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - mov counter, #8 + str q0, [src, #0*16] + str q2, [src, #2*16] + + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 + + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + mov counter, #3 _ntt_bot_loop: - ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld1 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + trn_4x4_l4 v12, v13, v14, v15, v8, v9, v10, v11, src, q0, q1, q2, q3, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + str q12, [src, #0*16] + str q13, [src, #1*16] - mov v4.S[0], Q + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - dq_butterfly_top v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 - dq_butterfly_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + str q14, [src, #2*16] + str q15, [src, #3*16] + + + add src, src, #64 + + trn_4x4_l4 v28, v29, v30, v31, v24, v25, v26, v27, src, q16, q17, q18, q19, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q28, q29, q30, q31, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + add table0, table0, #128 + add table1, table1, #128 + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 - dq_butterfly_vec_top v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7 - dq_butterfly_vec_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 - dq_butterfly_vec_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) + + str q0, [src, #0*16] + str q2, [src, #2*16] + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 - st4 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st4 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - .unreq Q - .unreq src0 - .unreq des0 - .unreq src1 - .unreq des1 - .unreq table0 - .unreq table1 - .unreq counter - pop_all + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - br lr + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + trn_4x4_s4 v28, v29, v30, v31, v16, v17, v18, v19, src, q12, q13, q14, q15, #0*16, #1*16, #2*16, #3*16 + str q28, [src, #(512+0*16)] + str q29, [src, #(512+1*16)] + str q30, [src, #(512+2*16)] + str q31, [src, #(512+3*16)] + add src, src, #64 + .unreq Q + .unreq src + .unreq table0 + .unreq table1 + .unreq counter + pop_simd + ret diff --git a/crypto_sign/dilithium5/aarch64/__asm_iNTT.S b/crypto_sign/dilithium5/aarch64/__asm_iNTT.S index a8191f5c..559d442a 100644 --- a/crypto_sign/dilithium5/aarch64/__asm_iNTT.S +++ b/crypto_sign/dilithium5/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top -PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: -_PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top +PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top: +_PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top: push_all Q .req w20 @@ -41,23 +44,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: invNR2dp .req w25 invNWR2ph .req w26 invNWR2dp .req w27 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 counter .req x19 ldr Q, [x2, #0] @@ -69,77 +56,63 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: ldr invNWR2ph, [x2, #24] ldr invNWR2dp, [x2, #28] - mov table, x1 + ldr q20, [x1, #0*16] + ldr q21, [x1, #1*16] + ldr q22, [x1, #2*16] + ldr q23, [x1, #3*16] + ldr q24, [x1, #4*16] + ldr q25, [x1, #5*16] + ldr q26, [x1, #6*16] + ldr q27, [x1, #7*16] - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 + mov v20.S[0], Q - add src11, src0, #704 - add src12, src0, #768 + ldr q0, [src, # 0*64] + ldr q1, [src, # 1*64] - add src13, src0, #832 - add src14, src0, #896 + ldr q2, [src, # 2*64] + ldr q3, [src, # 3*64] - add src15, src0, #960 + ldr q4, [src, # 4*64] + ldr q5, [src, # 5*64] - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ldr q6, [src, # 6*64] + ldr q7, [src, # 7*64] - mov v20.S[0], Q + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 - ld1 { v0.4S}, [ src0] - ld1 { v1.4S}, [ src1] - ld1 { v2.4S}, [ src2] - ld1 { v3.4S}, [ src3] - ld1 { v4.4S}, [ src4] - ld1 { v5.4S}, [ src5] - ld1 { v6.4S}, [ src6] - ld1 { v7.4S}, [ src7] - - ld1 { v8.4S}, [ src8] - ld1 { v9.4S}, [ src9] - ld1 {v10.4S}, [src10] - ld1 {v11.4S}, [src11] - ld1 {v12.4S}, [src12] - ld1 {v13.4S}, [src13] - ld1 {v14.4S}, [src14] - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + mov counter, #3 + _intt_top_loop: + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -153,77 +126,99 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: sub v17.4S, v17.4S, v19.4S mla v0.4S, v16.4S, v29.4S - mla v1.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v2.4S + mla v1.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v3.4S + + str q0, [src, #0*64] cmge v16.4S, v2.4S, v30.4S + ldr q0, [src, #(16 + 0*64)] + str q1, [src, #1*64] cmge v17.4S, v3.4S, v30.4S + ldr q1, [src, #(16 + 1*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v2.4S, v16.4S, v29.4S - mla v3.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v4.4S + mla v3.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v5.4S + + str q2, [src, #2*64] cmge v16.4S, v4.4S, v30.4S + ldr q2, [src, #(16 + 2*64)] + str q3, [src, #3*64] cmge v17.4S, v5.4S, v30.4S + ldr q3, [src, #(16 + 3*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v4.4S, v16.4S, v29.4S - mla v5.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v6.4S + mla v5.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v7.4S + + str q4, [src, #4*64] cmge v16.4S, v6.4S, v30.4S + ldr q4, [src, #(16 + 4*64)] + str q5, [src, #5*64] cmge v17.4S, v7.4S, v30.4S + ldr q5, [src, #(16 + 5*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v6.4S, v16.4S, v29.4S - mla v7.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v8.4S + mla v7.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v9.4S + + str q6, [src, #6*64] cmge v16.4S, v8.4S, v30.4S + ldr q6, [src, #(16 + 6*64)] + str q7, [src, #7*64] cmge v17.4S, v9.4S, v30.4S + ldr q7, [src, #(16 + 7*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v8.4S, v16.4S, v29.4S - mla v9.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v10.4S + mla v9.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v11.4S + + str q8, [src, #8*64] cmge v16.4S, v10.4S, v30.4S + str q9, [src, #9*64] cmge v17.4S, v11.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v10.4S, v16.4S, v29.4S - mla v11.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v12.4S + mla v11.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v13.4S + + str q10, [src, #10*64] cmge v16.4S, v12.4S, v30.4S + str q11, [src, #11*64] cmge v17.4S, v13.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v12.4S, v16.4S, v29.4S - mla v13.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v14.4S + mla v13.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v15.4S + + str q12, [src, #12*64] cmge v16.4S, v14.4S, v30.4S + str q13, [src, #13*64] cmge v17.4S, v15.4S, v30.4S sub v16.4S, v16.4S, v18.4S @@ -232,66 +227,45 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - mov counter, #3 - _intt_top_loop: - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 + + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 + + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -307,6 +281,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v0.4S, v16.4S, v29.4S mla v1.4S, v17.4S, v29.4S + str q0, [src, #0*64] + str q1, [src, #1*64] + cmge v18.4S, v31.4S, v2.4S cmge v19.4S, v31.4S, v3.4S cmge v16.4S, v2.4S, v30.4S @@ -318,6 +295,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v2.4S, v16.4S, v29.4S mla v3.4S, v17.4S, v29.4S + str q2, [src, #2*64] + str q3, [src, #3*64] + cmge v18.4S, v31.4S, v4.4S cmge v19.4S, v31.4S, v5.4S cmge v16.4S, v4.4S, v30.4S @@ -329,6 +309,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v4.4S, v16.4S, v29.4S mla v5.4S, v17.4S, v29.4S + str q4, [src, #4*64] + str q5, [src, #5*64] + cmge v18.4S, v31.4S, v6.4S cmge v19.4S, v31.4S, v7.4S cmge v16.4S, v6.4S, v30.4S @@ -340,6 +323,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v6.4S, v16.4S, v29.4S mla v7.4S, v17.4S, v29.4S + str q6, [src, #6*64] + str q7, [src, #7*64] + cmge v18.4S, v31.4S, v8.4S cmge v19.4S, v31.4S, v9.4S cmge v16.4S, v8.4S, v30.4S @@ -351,6 +337,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v8.4S, v16.4S, v29.4S mla v9.4S, v17.4S, v29.4S + str q8, [src, #8*64] + str q9, [src, #9*64] + cmge v18.4S, v31.4S, v10.4S cmge v19.4S, v31.4S, v11.4S cmge v16.4S, v10.4S, v30.4S @@ -362,6 +351,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v10.4S, v16.4S, v29.4S mla v11.4S, v17.4S, v29.4S + str q10, [src, #10*64] + str q11, [src, #11*64] + cmge v18.4S, v31.4S, v12.4S cmge v19.4S, v31.4S, v13.4S cmge v16.4S, v12.4S, v30.4S @@ -373,6 +365,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v12.4S, v16.4S, v29.4S mla v13.4S, v17.4S, v29.4S + str q12, [src, #12*64] + str q13, [src, #13*64] + cmge v18.4S, v31.4S, v14.4S cmge v19.4S, v31.4S, v15.4S cmge v16.4S, v14.4S, v30.4S @@ -384,26 +379,11 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - sub counter, counter, #1 - cbnz counter, _intt_top_loop + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 - st1 { v0.4S}, [ src0], #16 - st1 { v1.4S}, [ src1], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v7.4S}, [ src7], #16 - - st1 { v8.4S}, [ src8], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v10.4S}, [src10], #16 - st1 {v11.4S}, [src11], #16 - st1 {v12.4S}, [src12], #16 - st1 {v13.4S}, [src13], #16 - st1 {v14.4S}, [src14], #16 - st1 {v15.4S}, [src15], #16 .unreq Q .unreq Qhalf @@ -412,41 +392,23 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: .unreq invNR2dp .unreq invNWR2ph .unreq invNWR2dp - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot -PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot: -_PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot +PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot: +_PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot: push_all Q .req w20 RphRdp .req x21 src0 .req x0 - des0 .req x1 src1 .req x2 - des1 .req x3 table0 .req x28 table1 .req x27 counter .req x19 @@ -457,72 +419,175 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot: add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + add src1, src0, #512 - add des0, src0, #0 - add des1, src0, #512 + ldr q8, [table0, #4*16] + ldr q9, [table0, #5*16] + ldr q10, [table0, #6*16] + ldr q11, [table0, #7*16] - mov counter, #8 - _intt_bot_loop: + ldr q24, [table1, #4*16] + ldr q25, [table1, #5*16] + ldr q26, [table1, #6*16] + ldr q27, [table1, #7*16] + + ldr q0, [src0, # 0*16] + ldr q1, [src0, # 1*16] + + ldr q16, [src1, # 0*16] + ldr q17, [src1, # 1*16] - ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld4 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + ldr q2, [src0, # 2*16] + ldr q3, [src0, # 3*16] - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + ldr q18, [src1, # 2*16] + ldr q19, [src1, # 3*16] + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 mov v4.S[0], Q mov v20.D[0], RphRdp dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 - dq_butterfly_vec_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 - dq_butterfly_vec_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 - dq_butterfly_vec_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_top v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + mov counter, #7 + _intt_bot_loop: + + dq_butterfly_vec_top_ltrn_4x4 \ + v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + table0, \ + q8, q9, q10, q11, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16), \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q24, q25, q26, q27, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16) dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 - dq_butterfly_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + str q2, [src0, # 2*16] srshr v14.4S, v0.4S, #23 + ldr q2, [src0, #(64+ 2*16)] + str q3, [src0, # 3*16] srshr v15.4S, v1.4S, #23 + ldr q3, [src0, #(64+ 3*16)] + str q18, [src1, # 2*16] srshr v30.4S, v16.4S, #23 + ldr q18, [src1, #(64+ 2*16)] + str q19, [src1, # 3*16] srshr v31.4S, v17.4S, #23 + ldr q19, [src1, #(64+ 3*16)] mls v0.4S, v14.4S, v4.S[0] + str q0, [src0, # 0*16] + ldr q0, [src0, #(64+ 0*16)] mls v1.4S, v15.4S, v4.S[0] + str q1, [src0, # 1*16] + ldr q1, [src0, #(64+ 1*16)] mls v16.4S, v30.4S, v4.S[0] + str q16, [src1, # 0*16] + ldr q16, [src1, #(64+ 0*16)] mls v17.4S, v31.4S, v4.S[0] + str q17, [src1, # 1*16] + ldr q17, [src1, #(64+ 1*16)] + + add table0, table0, #128 + add table1, table1, #128 - st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st1 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 + + mov v4.S[0], Q + mov v20.D[0], RphRdp + + dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 sub counter, counter, #1 cbnz counter, _intt_bot_loop + dq_butterfly_vec_top_trn_4x4 \ + v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + + str q2, [src0, # 2*16] + str q3, [src0, # 3*16] + str q18, [src1, # 2*16] + str q19, [src1, # 3*16] + + srshr v14.4S, v0.4S, #23 + srshr v15.4S, v1.4S, #23 + srshr v30.4S, v16.4S, #23 + srshr v31.4S, v17.4S, #23 + + mls v0.4S, v14.4S, v4.S[0] + mls v1.4S, v15.4S, v4.S[0] + mls v16.4S, v30.4S, v4.S[0] + mls v17.4S, v31.4S, v4.S[0] + + str q0, [src0, # 0*16] + str q1, [src0, # 1*16] + str q16, [src1, # 0*16] + str q17, [src1, # 1*16] + + add table0, table0, #128 + add table1, table1, #128 + + add src0, src0, #64 + add src1, src1, #64 + .unreq Q .unreq RphRdp .unreq src0 - .unreq des0 .unreq src1 - .unreq des1 .unreq table0 .unreq table1 .unreq counter pop_all - br lr - - + ret diff --git a/crypto_sign/dilithium5/aarch64/__asm_poly.S b/crypto_sign/dilithium5/aarch64/__asm_poly.S index 49847b98..004d3ff3 100644 --- a/crypto_sign/dilithium5/aarch64/__asm_poly.S +++ b/crypto_sign/dilithium5/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,10 +32,10 @@ #include "params.h" .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32 -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32 -PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32: -_PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32 +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32 +PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32: +_PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32: mov x7, #16 _10_to_32_loop: @@ -45,7 +48,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32: str w4, [x0], #4 ubfx w5, w2, #20, #10 str w5, [x0], #4 - lsr w6, w2, #30 + lsr w6, w2, #30 ldr w2, [x1], #4 @@ -99,13 +102,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32: sub x7, x7, #1 cbnz x7, _10_to_32_loop - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce: ldr w4, [x1] @@ -117,7 +120,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: ld1 { v1.4S}, [x1], #16 ld1 { v2.4S}, [x1], #16 ld1 { v3.4S}, [x1], #16 - + ld1 { v4.4S}, [x1], #16 srshr v16.4S, v0.4S, #23 ld1 { v5.4S}, [x1], #16 @@ -126,7 +129,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -135,7 +138,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -192,13 +195,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq: ldr w4, [x1] @@ -285,13 +288,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze: ldr w4, [x1] @@ -312,7 +315,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -321,7 +324,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S sshr v16.4S, v0.4S, #31 mls v5.4S, v21.4S, v24.4S @@ -330,7 +333,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: sshr v18.4S, v2.4S, #31 mls v7.4S, v23.4S, v24.4S sshr v19.4S, v3.4S, #31 - + sshr v20.4S, v4.4S, #31 mls v0.4S, v16.4S, v24.4S sshr v21.4S, v5.4S, #31 @@ -339,7 +342,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S sshr v23.4S, v7.4S, #31 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -414,13 +417,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round: mov w4, #1 @@ -560,13 +563,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round: st1 {v30.4S}, [x1], #16 st1 {v31.4S}, [x1], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -609,13 +612,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -632,7 +635,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub: mov x16, #15 _poly_sub_loop: - + st1 {v16.4S}, [x0], #16 ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -658,13 +661,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl: add x1, x0, #0 @@ -725,13 +728,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl: st1 {v22.4S}, [x0], #16 st1 {v23.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery: push_all @@ -769,14 +772,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -819,14 +822,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -843,14 +846,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery -PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: -_PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery +PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery: +_PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery: push_all @@ -910,90 +913,90 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1027,14 +1030,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v27.4S, v23.4S, v31.4S ld1 { v7.4S}, [x2], #16 - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1064,90 +1067,90 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1173,14 +1176,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1194,7 +1197,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: pop_all - br lr + ret diff --git a/crypto_sign/dilithium5/aarch64/api.h b/crypto_sign/dilithium5/aarch64/api.h index 5668ee3c..db2cc3ab 100644 --- a/crypto_sign/dilithium5/aarch64/api.h +++ b/crypto_sign/dilithium5/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef PQCLEAN_DILITHIUM5_AARCH64_API_H -#define PQCLEAN_DILITHIUM5_AARCH64_API_H +#ifndef API_H +#define API_H /* * This file is dual licensed @@ -12,8 +12,8 @@ #define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES 2592 #define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES 4896 -#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES 4627 -#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_ALGNAME "Dilithium5" +#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES 4627 +#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_ALGNAME "Dilithium5" int PQCLEAN_DILITHIUM5_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); diff --git a/crypto_sign/dilithium5/aarch64/feat.S b/crypto_sign/dilithium5/aarch64/feat.S index 01abc10a..f467fa80 100644 --- a/crypto_sign/dilithium5/aarch64/feat.S +++ b/crypto_sign/dilithium5/aarch64/feat.S @@ -123,10 +123,8 @@ SOFTWARE. .endm .align 4 -.global PQCLEAN_DILITHIUM5_AARCH64_f1600x2 -.global _PQCLEAN_DILITHIUM5_AARCH64_f1600x2 -PQCLEAN_DILITHIUM5_AARCH64_f1600x2: -_PQCLEAN_DILITHIUM5_AARCH64_f1600x2: +.global _f1600x2 +_f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! diff --git a/crypto_sign/dilithium5/aarch64/fips202x2.c b/crypto_sign/dilithium5/aarch64/fips202x2.c index 63761d23..e045ee3d 100644 --- a/crypto_sign/dilithium5/aarch64/fips202x2.c +++ b/crypto_sign/dilithium5/aarch64/fips202x2.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -36,6 +37,11 @@ #include #include "fips202x2.h" +#ifdef PROFILE_HASHING +#include "hal.h" +extern unsigned long long hash_cycles; +#endif + #define NROUNDS 24 // Define NEON operation @@ -47,20 +53,20 @@ #define vxor(c, a, b) c = veorq_u64(a, b); // Rotate by n bit ((a << offset) ^ (a >> (64-offset))) #define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); + out = vshlq_n_u64(a, offset); \ + out = vsriq_n_u64(out, a, 64 - offset); // Xor chain: out = a ^ b ^ c ^ d ^ e #define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); // Not And c = ~a & b // #define vbic(c, a, b) c = vbicq_u64(b, a); // Xor Not And: out = a ^ ( (~b) & c) #define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); // Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support #define vrxor(c, a, b) c = vrax1q_u64(a, b); // End Define @@ -100,11 +106,11 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { * * Arguments: - uint64_t *state: pointer to input/output Keccak state **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_f1600x2(v128 *, const uint64_t *); +extern void f1600x2(v128 *, const uint64_t *); static inline void KeccakF1600_StatePermutex2(v128 state[25]) { #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_DILITHIUM5_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); + f1600x2(state, neon_KeccakF_RoundConstants); #else v128 Aba, Abe, Abi, Abo, Abu; v128 Aga, Age, Agi, Ago, Agu; @@ -551,7 +557,14 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -570,7 +583,14 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -587,7 +607,14 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -606,7 +633,14 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -625,6 +659,9 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -644,6 +681,10 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } /************************************************* @@ -662,6 +703,9 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { + #ifdef PROFILE_HASHING + uint64_t t0 = hal_get_time(); + #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -681,4 +725,8 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } + #ifdef PROFILE_HASHING + uint64_t t1 = hal_get_time(); + hash_cycles += (t1 - t0); + #endif } diff --git a/crypto_sign/dilithium5/aarch64/fips202x2.h b/crypto_sign/dilithium5/aarch64/fips202x2.h index 28babbc3..3066c52b 100644 --- a/crypto_sign/dilithium5/aarch64/fips202x2.h +++ b/crypto_sign/dilithium5/aarch64/fips202x2.h @@ -8,9 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" -#include #include +#include typedef uint64x2_t v128; @@ -23,31 +22,26 @@ typedef struct { v128 s[25]; } keccakx2_state; -#define shake128x2_absorb DILITHIUM_NAMESPACE(shake128x2_absorb) void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake128x2_squeezeblocks DILITHIUM_NAMESPACE(shake128x2_squeezeblocks) void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake256x2_absorb DILITHIUM_NAMESPACE(shake256x2_absorb) void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake256x2_squeezeblocks DILITHIUM_NAMESPACE(shake256x2_squeezeblocks) void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake128x2 DILITHIUM_NAMESPACE(shake128x2) void shake128x2(uint8_t *out0, uint8_t *out1, size_t outlen, @@ -55,11 +49,11 @@ void shake128x2(uint8_t *out0, const uint8_t *in1, size_t inlen); -#define shake256x2 DILITHIUM_NAMESPACE(shake256x2) void shake256x2(uint8_t *out0, uint8_t *out1, size_t outlen, const uint8_t *in0, const uint8_t *in1, size_t inlen); + #endif diff --git a/crypto_sign/dilithium5/aarch64/macros.inc b/crypto_sign/dilithium5/aarch64/macros.inc index ef3af4c5..5504405c 100644 --- a/crypto_sign/dilithium5/aarch64/macros.inc +++ b/crypto_sign/dilithium5/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,24 +30,254 @@ #include "macros_common.inc" -.macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - trn1 \t0\qS, \a0\qS, \a1\qS - trn2 \t1\qS, \a0\qS, \a1\qS - trn1 \t2\qS, \a2\qS, \a3\qS - trn2 \t3\qS, \a2\qS, \a3\qS + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S - trn1 \a0\dD, \t0\dD, \t2\dD - trn2 \a2\dD, \t0\dD, \t2\dD - trn1 \a1\dD, \t1\dD, \t3\dD - trn2 \a3\dD, \t1\dD, \t3\dD + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D .endm -.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - wrap_trn_4x4 \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, .4S, .2D + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + +.macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + +.macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 .endm +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + + +.macro do_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== .macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S @@ -54,12 +287,20 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 .endm -.macro dq_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S .endm @@ -67,16 +308,32 @@ wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + .macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 .endm -.macro dq_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm @@ -89,16 +346,48 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + .macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm -.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 .endm -.macro qq_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S .endm @@ -109,3 +398,5 @@ .macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S .endm + +// === 32-bit end ==== diff --git a/crypto_sign/dilithium5/aarch64/macros_common.inc b/crypto_sign/dilithium5/aarch64/macros_common.inc index bd7e77eb..07568491 100644 --- a/crypto_sign/dilithium5/aarch64/macros_common.inc +++ b/crypto_sign/dilithium5/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,35 +28,58 @@ * SOFTWARE. */ +#ifndef MACROS_COMMON +#define MACROS_COMMON + // for ABI .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -72,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -82,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -99,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -135,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -149,7 +310,134 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t4\wX, \b4\wX, \z4\nX[\h4] @@ -176,7 +464,186 @@ .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] + mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] + mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] + mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -218,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -228,15 +769,82 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] sub \b1\wX, \a1\wX, \t1\wX mul \t3\wX, \b3\wX, \h3\wX + ldr \c2, [\srcc_ptr, \memc2] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b3\wX, \b3\wX, \l3\wX @@ -245,7 +853,7 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX @@ -262,57 +870,98 @@ .endm +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + // vector-scalar Barrett reduction .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -391,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX @@ -448,3 +1191,10 @@ add \s3\wX, \a3\wX, \b3\wX .endm + + +#endif + + + + diff --git a/crypto_sign/dilithium5/aarch64/ntt.c b/crypto_sign/dilithium5/aarch64/ntt.c index 2d88c5d5..92d92313 100644 --- a/crypto_sign/dilithium5/aarch64/ntt.c +++ b/crypto_sign/dilithium5/aarch64/ntt.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,13 +33,28 @@ */ #include "params.h" -#include "reduce.h" #include #include #include "NTT_params.h" #include "ntt.h" +const __attribute__ ((aligned (16)))int32_t constants[16] = { + Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, + invNQ1R2modQ1_prime_half, + invNQ1R2modQ1_doubleprime, + invNQ1_final_R2modQ1_prime_half, + invNQ1_final_R2modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { +0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { +0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 +}; + /************************************************* * Name: ntt * diff --git a/crypto_sign/dilithium5/aarch64/ntt.h b/crypto_sign/dilithium5/aarch64/ntt.h index 79209c37..6797322b 100644 --- a/crypto_sign/dilithium5/aarch64/ntt.h +++ b/crypto_sign/dilithium5/aarch64/ntt.h @@ -6,12 +6,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,45 +34,39 @@ * SOFTWARE. */ -#include "NTT_params.h" -#include "params.h" #include +#include "params.h" +#include "NTT_params.h" + +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants); +extern +const int32_t constants[16]; -#define NTT(in) { \ - PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define iNTT(in) { \ - PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; + +#define NTT(in) do { \ + PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + } while(0) #define ntt DILITHIUM_NAMESPACE(ntt) void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); -static const int constants[16] = { - Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, - invNQ1R2modQ1_prime_half, - invNQ1R2modQ1_doubleprime, - invNQ1_final_R2modQ1_prime_half, - invNQ1_final_R2modQ1_doubleprime -}; - -static const int streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 0, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 3073009, 327391679, 1277625, -675340520, -2635473, 987079667, 3852015, 449207, 1753, -495951789, -1935420, -681503850, -2659525, -373072124, -1455890, 681730119, 2660408, -456183549, -1780227, -15156688, -59148, 710479343, 2772600, 0, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, 4183372, -825844983, -3222807, -799869667, -3121440, -70227934, -274060, 302950022, 1182243, 22347069, 87208, 163212680, 636927, -1016110510, -3965306, -1013916752, -3956745, -588452222, -2296397, -841760171, -3284915, -952468207, -3716946, 0, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 2508980, 519705671, 2028118, 496502727, 1937570, -977780347, -3815725, -7126831, -27812, 210776307, 822541, 258649997, 1009365, -628875181, -2454145, -507246529, -1979497, 409185979, 1596822, -1013967746, -3956944, -963363710, -3759465, 0, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, 2811291, -764594519, -2983781, -284313712, -1109516, 1065510939, 4158088, -431820817, -1685153, -873958779, -3410568, 686309310, 2678278, -965793731, -3768948, -909946047, -3551006, 162963861, 635956, -64176841, -250446, -629190881, -2455377, 0, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 1528066, 123678909, 482649, 294395108, 1148858, -759080783, -2962264, -1062481036, -4146264, -454226054, -1772588, 561940831, 2192938, -442566669, -1727088, 611800717, 2387513, -925511710, -3611750, -68791907, -268456, -814992530, -3180456, 0, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, -565603, 43482586, 169688, 631001801, 2462444, -854436357, -3334383, 960233614, 3747250, 588375860, 2296099, 317727459, 1239911, -983611064, -3838479, 818892658, 3195676, 677264190, 2642980, 321386456, 1254190, -3181859, -12417, 0, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -4166425, -893898890, -3488383, 509377762, 1987814, -819295484, -3197248, 768294260, 2998219, 36345249, 141835, -22883400, -89301, 643961400, 2513018, -347191365, -1354892, 157142369, 613238, -335754661, -1310261, -568482643, -2218467, 0, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 1736313, 60323094, 235407, -832852657, -3250154, 834980303, 3258457, -117552223, -458740, -492511373, -1921994, 1035301089, 4040196, -889718424, -3472069, 522531086, 2039144, -481719139, -1879878, -209807681, -818761, -558360247, -2178965, 0, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, -2579253, 458160776, 1787943, -612717067, -2391089, -577774276, -2254727, -415984810, -1623354, 539479988, 2105286, -608441020, -2374402, -521163479, -2033807, 150224382, 586241, -302276083, -1179613, 135295244, 527981, -702999655, -2743411, 0, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, 3482206, -1071872863, -4182915, -333129378, -1300016, -605279149, -2362063, -378477722, -1476985, 510974714, 1994046, 638402564, 2491325, -356997292, -1393159, 130156402, 507927, -304395785, -1187885, -185731180, -724804, -470097680, -1834526, 0, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, -1317678, 630730945, 2461387, 777970524, 3035980, 159173408, 621164, -777397036, -3033742, -86720197, -338420, 678549029, 2647994, 771248568, 3009748, -669544140, -2612853, 1063046068, 4148469, 192079267, 749577, -1030830548, -4022750, 0, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, 3901472, -314332144, -1226661, 749740976, 2925816, 864652284, 3374250, 1020029345, 3980599, 658309618, 2569011, -413979908, -1615530, 441577800, 1723229, 426738094, 1665318, 519685171, 2028038, 298172236, 1163598, -863376927, -3369273, 0, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, 1356448, -711287812, -2775755, 687588511, 2683270, -712065019, -2778788, 1023635298, 3994671, -3043996, -11879, -351195274, -1370517, 773976352, 3020393, 861908357, 3363542, 55063046, 214880, 139752717, 545376, -197425671, -770441, 0, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, -3467665, 592665232, 2312838, -167401858, -653275, -117660617, -459163, 795799901, 3105558, -282732136, -1103344, 130212265, 508145, -141890356, -553718, 220412084, 860144, 879049958, 3430436, 35937555, 140244, -388001774, -1514152, 0, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, 348812, -84011120, -327848, 259126110, 1011223, -603268097, -2354215, -559928242, -2185084, 800464680, 3123762, 604333585, 2358373, -561979013, -2193087, -772445769, -3014420, -439933955, -1716814, 749801963, 2926054, -100631253, -392707, 0, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -3818627, -492577742, -1922253, -573161516, -2236726, 447030292, 1744507, -77645096, -303005, 904878186, 3531229, -1018462631, -3974485, -967019376, -3773731, 486888731, 1900052, -200355636, -781875, 270210213, 1054478, -187430119, -731434, 0, 0 -}; - -static const int streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 0, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, -1744507, 573161516, 2236726, 492577742, 1922253, 978523985, 3818627, 187430119, 731434, -270210213, -1054478, 200355636, 781875, -486888731, -1900052, 967019376, 3773731, 1018462631, 3974485, -904878186, -3531229, 77645096, 303005, 0, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, 2354215, -259126110, -1011223, 84011120, 327848, -89383150, -348812, 100631253, 392707, -749801963, -2926054, 439933955, 1716814, 772445769, 3014420, 561979013, 2193087, -604333585, -2358373, -800464680, -3123762, 559928242, 2185084, 0, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 459163, 167401858, 653275, -592665232, -2312838, 888589898, 3467665, 388001774, 1514152, -35937555, -140244, -879049958, -3430436, -220412084, -860144, 141890356, 553718, -130212265, -508145, 282732136, 1103344, -795799901, -3105558, 0, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, 2778788, -687588511, -2683270, 711287812, 2775755, -347590090, -1356448, 197425671, 770441, -139752717, -545376, -55063046, -214880, -861908357, -3363542, -773976352, -3020393, 351195274, 1370517, 3043996, 11879, -1023635298, -3994671, 0, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -3374250, -749740976, -2925816, 314332144, 1226661, -999753034, -3901472, 863376927, 3369273, -298172236, -1163598, -519685171, -2028038, -426738094, -1665318, -441577800, -1723229, 413979908, 1615530, -658309618, -2569011, -1020029345, -3980599, 0, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -621164, -777970524, -3035980, -630730945, -2461387, 337655269, 1317678, 1030830548, 4022750, -192079267, -749577, -1063046068, -4148469, 669544140, 2612853, -771248568, -3009748, -678549029, -2647994, 86720197, 338420, 777397036, 3033742, 0, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 2362063, 333129378, 1300016, 1071872863, 4182915, -892316032, -3482206, 470097680, 1834526, 185731180, 724804, 304395785, 1187885, -130156402, -507927, 356997292, 1393159, -638402564, -2491325, -510974714, -1994046, 378477722, 1476985, 0, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 2254727, 612717067, 2391089, -458160776, -1787943, 660934133, 2579253, 702999655, 2743411, -135295244, -527981, 302276083, 1179613, -150224382, -586241, 521163479, 2033807, 608441020, 2374402, -539479988, -2105286, 415984810, 1623354, 0, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, -3258457, 832852657, 3250154, -60323094, -235407, -444930577, -1736313, 558360247, 2178965, 209807681, 818761, 481719139, 1879878, -522531086, -2039144, 889718424, 3472069, -1035301089, -4040196, 492511373, 1921994, 117552223, 458740, 0, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, 3197248, -509377762, -1987814, 893898890, 3488383, 1067647297, 4166425, 568482643, 2218467, 335754661, 1310261, -157142369, -613238, 347191365, 1354892, -643961400, -2513018, 22883400, 89301, -36345249, -141835, -768294260, -2998219, 0, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, 3334383, -631001801, -2462444, -43482586, -169688, 144935890, 565603, 3181859, 12417, -321386456, -1254190, -677264190, -2642980, -818892658, -3195676, 983611064, 3838479, -317727459, -1239911, -588375860, -2296099, -960233614, -3747250, 0, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, 2962264, -294395108, -1148858, -123678909, -482649, -391567239, -1528066, 814992530, 3180456, 68791907, 268456, 925511710, 3611750, -611800717, -2387513, 442566669, 1727088, -561940831, -2192938, 454226054, 1772588, 1062481036, 4146264, 0, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, -4158088, 284313712, 1109516, 764594519, 2983781, -720393920, -2811291, 629190881, 2455377, 64176841, 250446, -162963861, -635956, 909946047, 3551006, 965793731, 3768948, -686309310, -2678278, 873958779, 3410568, 431820817, 1685153, 0, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, 3815725, -496502727, -1937570, -519705671, -2028118, -642926661, -2508980, 963363710, 3759465, 1013967746, 3956944, -409185979, -1596822, 507246529, 1979497, 628875181, 2454145, -258649997, -1009365, -210776307, -822541, 7126831, 27812, 0, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 274060, 799869667, 3121440, 825844983, 3222807, -1071989969, -4183372, 952468207, 3716946, 841760171, 3284915, 588452222, 2296397, 1013916752, 3956745, 1016110510, 3965306, -163212680, -636927, -22347069, -87208, -302950022, -1182243, 0, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, -3852015, 675340520, 2635473, -327391679, -1277625, -787459213, -3073009, -710479343, -2772600, 15156688, 59148, 456183549, 1780227, -681730119, -2660408, 373072124, 1455890, 681503850, 2659525, 495951789, 1935420, -449207, -1753, 0, 0 -}; #endif diff --git a/crypto_sign/dilithium5/aarch64/packing.c b/crypto_sign/dilithium5/aarch64/packing.c index 8fa3b0cc..0f033da1 100644 --- a/crypto_sign/dilithium5/aarch64/packing.c +++ b/crypto_sign/dilithium5/aarch64/packing.c @@ -19,7 +19,7 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1) { unsigned int i; @@ -45,7 +45,7 @@ void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], **************************************************/ void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) { + const uint8_t pk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -71,7 +71,7 @@ void unpack_pk(uint8_t rho[SEEDBYTES], * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 **************************************************/ -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -129,7 +129,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]) { + const uint8_t sk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -172,7 +172,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h **************************************************/ -void pack_sig(uint8_t sig[CRYPTO_BYTES], +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h) { @@ -221,7 +221,7 @@ void pack_sig(uint8_t sig[CRYPTO_BYTES], int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, - const uint8_t sig[CRYPTO_BYTES]) { + const uint8_t sig[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES]) { unsigned int i, j, k; for (i = 0; i < CTILDEBYTES; ++i) { diff --git a/crypto_sign/dilithium5/aarch64/packing.h b/crypto_sign/dilithium5/aarch64/packing.h index fb70ce5d..050dc8e6 100644 --- a/crypto_sign/dilithium5/aarch64/packing.h +++ b/crypto_sign/dilithium5/aarch64/packing.h @@ -7,15 +7,16 @@ * or public domain at https://github.com/pq-crystals/dilithium */ +#include "api.h" #include "params.h" #include "polyvec.h" #include #define pack_pk DILITHIUM_NAMESPACE(pack_pk) -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); #define pack_sk DILITHIUM_NAMESPACE(pack_sk) -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -24,10 +25,10 @@ void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], const polyveck *s2); #define pack_sig DILITHIUM_NAMESPACE(pack_sig) -void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); #define unpack_pk DILITHIUM_NAMESPACE(unpack_pk) -void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]); +void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES]); #define unpack_sk DILITHIUM_NAMESPACE(unpack_sk) void unpack_sk(uint8_t rho[SEEDBYTES], @@ -36,9 +37,9 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + const uint8_t sk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES]); #define unpack_sig DILITHIUM_NAMESPACE(unpack_sig) -int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); +int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES]); #endif diff --git a/crypto_sign/dilithium5/aarch64/params.h b/crypto_sign/dilithium5/aarch64/params.h index 8c7addde..03e5322f 100644 --- a/crypto_sign/dilithium5/aarch64/params.h +++ b/crypto_sign/dilithium5/aarch64/params.h @@ -11,8 +11,8 @@ //#define DILITHIUM_MODE 3 #define DILITHIUM_MODE 5 -#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM5_AARCH64_crypto_sign #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM5_AARCH64_##s +#define CRYPTO_NAMESPACETOP crypto_sign #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s) @@ -25,6 +25,32 @@ #define D 13 #define ROOT_OF_UNITY 1753 +#if DILITHIUM_MODE == 2 + +#define K 4 +#define L 4 +#define ETA 2 +#define TAU 39 +#define BETA 78 +#define GAMMA1 (1 << 17) +#define GAMMA2 ((DILITHIUM_Q-1)/88) +#define OMEGA 80 +#define CRYPTO_ALGNAME "Dilithium2" +#define CTILDEBYTES 32 +#elif DILITHIUM_MODE == 3 + +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((DILITHIUM_Q-1)/32) +#define OMEGA 55 +#define CRYPTO_ALGNAME "Dilithium3" +#define CTILDEBYTES 48 +#elif DILITHIUM_MODE == 5 + #define K 8 #define L 7 #define ETA 2 @@ -35,23 +61,42 @@ #define OMEGA 75 #define CRYPTO_ALGNAME "Dilithium5" #define CTILDEBYTES 64 +#else + +#error "No parameter specified!" + +#endif + + #define POLYT1_PACKEDBYTES 320 #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) +#if GAMMA1 == (1 << 17) +#define POLYZ_PACKEDBYTES 576 +#elif GAMMA1 == (1 << 19) #define POLYZ_PACKEDBYTES 640 +#endif +#if GAMMA2 == (DILITHIUM_Q-1)/88 +#define POLYW1_PACKEDBYTES 192 +#elif GAMMA2 == (DILITHIUM_Q-1)/32 #define POLYW1_PACKEDBYTES 128 +#endif +#if ETA == 2 #define POLYETA_PACKEDBYTES 96 +#elif ETA == 4 +#define POLYETA_PACKEDBYTES 128 +#endif -#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) -#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ - + TRBYTES \ - + L*POLYETA_PACKEDBYTES \ - + K*POLYETA_PACKEDBYTES \ - + K*POLYT0_PACKEDBYTES) -#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) +#define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ + + TRBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define DILITHIUM_CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/crypto_sign/dilithium5/aarch64/poly.c b/crypto_sign/dilithium5/aarch64/poly.c index 782e725a..84b4487e 100644 --- a/crypto_sign/dilithium5/aarch64/poly.c +++ b/crypto_sign/dilithium5/aarch64/poly.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,13 +41,8 @@ #include "fips202x2.h" -#include "NTT_params.h" #include "ntt.h" -static const int32_t montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; - #define DBENCH_START() #define DBENCH_STOP(t) @@ -57,11 +54,11 @@ static const int32_t montgomery_const[4] = { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce(int32_t *, const int32_t *); void poly_reduce(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -74,11 +71,11 @@ void poly_reduce(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq(int32_t *, const int32_t *); void poly_caddq(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -91,11 +88,11 @@ void poly_caddq(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze(int32_t *, const int32_t *); void poly_freeze(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -205,11 +202,11 @@ void poly_invntt_tomont(poly *a) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, constants); DBENCH_STOP(*tmul); } @@ -226,11 +223,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { * - poly *a0: pointer to output polynomial with coefficients c0 * - const poly *a: pointer to input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *); void poly_power2round(poly *a1, poly *a0, const poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); DBENCH_STOP(*tround); } @@ -470,6 +467,8 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; + #if ETA == 2 + if (t0 < 15) { t0 = t0 - (205 * t0 >> 10) * 5; a[ctr++] = 2 - t0; @@ -479,6 +478,21 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 2 - t1; } + #elif ETA == 4 + + if (t0 < 9) { + a[ctr++] = 4 - t0; + } + if (t1 < 9 && ctr < len) { + a[ctr++] = 4 - t1; + } + + #else + +#error "No parameter specified!" + + #endif + } DBENCH_STOP(*tsample); @@ -496,7 +510,11 @@ static unsigned int rej_eta(int32_t *a, * - const uint8_t seed[]: byte array with seed of length CRHBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ +#if ETA == 2 #define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +#elif ETA == 4 +#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +#endif void poly_uniform_eta(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { @@ -643,6 +661,8 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); + #if ETA == 2 + for (i = 0; i < N / 8; ++i) { t[0] = ETA - a->coeffs[8 * i + 0]; t[1] = ETA - a->coeffs[8 * i + 1]; @@ -658,6 +678,20 @@ void polyeta_pack(uint8_t *r, const poly *a) { r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); } + #elif ETA == 4 + + for (i = 0; i < N / 2; ++i) { + t[0] = ETA - a->coeffs[2 * i + 0]; + t[1] = ETA - a->coeffs[2 * i + 1]; + r[i] = t[0] | (t[1] << 4); + } + + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -673,6 +707,8 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); + #if ETA == 2 + for (i = 0; i < N / 8; ++i) { r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; @@ -693,6 +729,21 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; } + #elif ETA == 4 + + for (i = 0; i < N / 2; ++i) { + r->coeffs[2 * i + 0] = a[i] & 0x0F; + r->coeffs[2 * i + 1] = a[i] >> 4; + r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; + } + + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -730,11 +781,11 @@ void polyt1_pack(uint8_t *r, const poly *a) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32(int32_t *, const uint8_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32(int32_t *, const uint8_t *); void polyt1_unpack(poly *r, const uint8_t *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32(r->coeffs, a); + PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32(r->coeffs, a); DBENCH_STOP(*tpack); } @@ -865,6 +916,30 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); + #if GAMMA1 == (1 << 17) + + for (i = 0; i < N / 4; ++i) { + t[0] = GAMMA1 - a->coeffs[4 * i + 0]; + t[1] = GAMMA1 - a->coeffs[4 * i + 1]; + t[2] = GAMMA1 - a->coeffs[4 * i + 2]; + t[3] = GAMMA1 - a->coeffs[4 * i + 3]; + + r[9 * i + 0] = t[0]; + r[9 * i + 1] = t[0] >> 8; + r[9 * i + 2] = t[0] >> 16; + r[9 * i + 2] |= t[1] << 2; + r[9 * i + 3] = t[1] >> 6; + r[9 * i + 4] = t[1] >> 14; + r[9 * i + 4] |= t[2] << 4; + r[9 * i + 5] = t[2] >> 4; + r[9 * i + 6] = t[2] >> 12; + r[9 * i + 6] |= t[3] << 6; + r[9 * i + 7] = t[3] >> 2; + r[9 * i + 8] = t[3] >> 10; + } + + #elif GAMMA1 == (1 << 19) + for (i = 0; i < N / 2; ++i) { t[0] = GAMMA1 - a->coeffs[2 * i + 0]; t[1] = GAMMA1 - a->coeffs[2 * i + 1]; @@ -877,6 +952,12 @@ void polyz_pack(uint8_t *r, const poly *a) { r[5 * i + 4] = t[1] >> 12; } + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -893,6 +974,37 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); + #if GAMMA1 == (1 << 17) + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = a[9 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16; + r->coeffs[4 * i + 0] &= 0x3FFFF; + + r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14; + r->coeffs[4 * i + 1] &= 0x3FFFF; + + r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12; + r->coeffs[4 * i + 2] &= 0x3FFFF; + + r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10; + r->coeffs[4 * i + 3] &= 0x3FFFF; + + r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0]; + r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1]; + r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2]; + r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; + } + + #elif GAMMA1 == (1 << 19) + for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; @@ -908,6 +1020,12 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; } + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } @@ -925,9 +1043,28 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); + #if GAMMA2 == (DILITHIUM_Q-1)/88 + + for (i = 0; i < N / 4; ++i) { + r[3 * i + 0] = a->coeffs[4 * i + 0]; + r[3 * i + 0] |= a->coeffs[4 * i + 1] << 6; + r[3 * i + 1] = a->coeffs[4 * i + 1] >> 2; + r[3 * i + 1] |= a->coeffs[4 * i + 2] << 4; + r[3 * i + 2] = a->coeffs[4 * i + 2] >> 4; + r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2; + } + + #elif GAMMA2 == (DILITHIUM_Q-1)/32 + for (i = 0; i < N / 2; ++i) { r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); } + #else + +#error "No parameter specified!" + + #endif + DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium5/aarch64/polyvec.c b/crypto_sign/dilithium5/aarch64/polyvec.c index e6ec99f9..9218650c 100644 --- a/crypto_sign/dilithium5/aarch64/polyvec.c +++ b/crypto_sign/dilithium5/aarch64/polyvec.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,17 +32,14 @@ * SOFTWARE. */ +#include + #include "params.h" #include "poly.h" #include "polyvec.h" -#include - +#include "ntt.h" #include "reduce.h" -static const int32_t l_montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; - /************************************************* * Name: expand_mat * @@ -177,11 +176,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { - PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, constants); } /************************************************* diff --git a/crypto_sign/dilithium5/aarch64/polyvec.h b/crypto_sign/dilithium5/aarch64/polyvec.h index dc3377c9..8844ca79 100644 --- a/crypto_sign/dilithium5/aarch64/polyvec.h +++ b/crypto_sign/dilithium5/aarch64/polyvec.h @@ -42,9 +42,12 @@ void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v); + #define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm) int polyvecl_chknorm(const polyvecl *v, int32_t B); + + /* Vectors of polynomials of length K */ typedef struct { poly vec[K]; diff --git a/crypto_sign/dilithium5/aarch64/rounding.c b/crypto_sign/dilithium5/aarch64/rounding.c index 871c9759..30c97510 100644 --- a/crypto_sign/dilithium5/aarch64/rounding.c +++ b/crypto_sign/dilithium5/aarch64/rounding.c @@ -47,10 +47,22 @@ int32_t decompose(int32_t *a0, int32_t a) { int32_t a1; a1 = (a + 127) >> 7; + #if GAMMA2 == (DILITHIUM_Q-1)/32 a1 = (a1 * 1025 + (1 << 21)) >> 22; a1 &= 15; + #elif GAMMA2 == (DILITHIUM_Q-1)/88 + + a1 = (a1 * 11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; + + #else + +#error "No parameter specified" + + #endif + *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; return a1; @@ -93,9 +105,22 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } + #if GAMMA2 == (DILITHIUM_Q-1)/32 + if (a0 > 0) { return (a1 + 1) & 15; + } else { + return (a1 - 1) & 15; } - return (a1 - 1) & 15; + + #elif GAMMA2 == (DILITHIUM_Q-1)/88 + + if (a0 > 0) { + return (a1 == 43) ? 0 : a1 + 1; + } else { + return (a1 == 0) ? 43 : a1 - 1; + } + + #endif } diff --git a/crypto_sign/dilithium5/aarch64/sign.c b/crypto_sign/dilithium5/aarch64/sign.c index 3565b370..4b0be0f5 100644 --- a/crypto_sign/dilithium5/aarch64/sign.c +++ b/crypto_sign/dilithium5/aarch64/sign.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -90,7 +91,7 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { pack_pk(pk, rho, &t1); /* Compute H(rho, t1) and write secret key */ - shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(tr, TRBYTES, pk, PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES); pack_sk(sk, rho, tr, key, &t0, &s1, &s2); return 0; @@ -139,7 +140,8 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); - for (n = 0; n < RNDBYTES; n++) { + + for(n = 0; n < RNDBYTES; n++) { rnd[n] = 0; } shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); @@ -210,7 +212,7 @@ int crypto_sign_signature(uint8_t *sig, /* Write signature */ pack_sig(sig, sig, &z, &h); - *siglen = CRYPTO_BYTES; + *siglen = PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES; return 0; } @@ -238,9 +240,9 @@ int crypto_sign(uint8_t *sm, size_t i; for (i = 0; i < mlen; ++i) { - sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + sm[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; } - crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); + crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES, mlen, sk); *smlen += mlen; return 0; } @@ -274,7 +276,7 @@ int crypto_sign_verify(const uint8_t *sig, polyveck t1, w1, h; shake256incctx state; - if (siglen != CRYPTO_BYTES) { + if (siglen != PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES) { return -1; } @@ -287,7 +289,7 @@ int crypto_sign_verify(const uint8_t *sig, } /* Compute CRH(H(rho, t1), msg) */ - shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(mu, CRHBYTES, pk, PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES); shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -353,17 +355,17 @@ int crypto_sign_open(uint8_t *m, const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; - if (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES; + if (crypto_sign_verify(sm, PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] = sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES + i]; } return 0; } diff --git a/crypto_sign/dilithium5/aarch64/sign.h b/crypto_sign/dilithium5/aarch64/sign.h index bc8c4265..05e7b5f6 100644 --- a/crypto_sign/dilithium5/aarch64/sign.h +++ b/crypto_sign/dilithium5/aarch64/sign.h @@ -13,6 +13,7 @@ #include #include + #define challenge DILITHIUM_NAMESPACE(challenge) void challenge(poly *c, const uint8_t seed[SEEDBYTES]); @@ -24,7 +25,7 @@ int crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); -#define crypto_sign DILITHIUM_NAMESPACETOP +#define crypto_sign DILITHIUM_NAMESPACE(crypto_sign) int crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk); diff --git a/crypto_sign/dilithium5/aarch64/symmetric-shake.c b/crypto_sign/dilithium5/aarch64/symmetric-shake.c index a53074aa..53aab1c9 100644 --- a/crypto_sign/dilithium5/aarch64/symmetric-shake.c +++ b/crypto_sign/dilithium5/aarch64/symmetric-shake.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/crypto_sign/dilithium5/aarch64/symmetric.h b/crypto_sign/dilithium5/aarch64/symmetric.h index 40b928ec..74d21021 100644 --- a/crypto_sign/dilithium5/aarch64/symmetric.h +++ b/crypto_sign/dilithium5/aarch64/symmetric.h @@ -6,8 +6,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -59,6 +60,7 @@ void dilithium_shake256x2_stream_init(keccakx2_state *state, const uint8_t seed[CRHBYTES], uint16_t nonce1, uint16_t nonce2); + #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE From 58ad70da52744657c1835205f5be61b83c3ffc74 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 16:59:25 +0100 Subject: [PATCH 62/85] rm some #ifdef --- crypto_kem/kyber1024/aarch64/cbd.c | 4 -- crypto_kem/kyber1024/aarch64/feat.S | 2 + crypto_kem/kyber1024/aarch64/fips202x2.c | 46 ---------------------- crypto_kem/kyber512/aarch64/cbd.c | 4 -- crypto_kem/kyber512/aarch64/feat.S | 2 + crypto_kem/kyber512/aarch64/fips202x2.c | 46 ---------------------- crypto_kem/kyber768/aarch64/cbd.c | 4 -- crypto_kem/kyber768/aarch64/feat.S | 2 + crypto_kem/kyber768/aarch64/fips202x2.c | 46 ---------------------- crypto_sign/dilithium2/aarch64/feat.S | 2 + crypto_sign/dilithium2/aarch64/fips202x2.c | 46 ---------------------- crypto_sign/dilithium3/aarch64/feat.S | 2 + crypto_sign/dilithium3/aarch64/fips202x2.c | 46 ---------------------- crypto_sign/dilithium5/aarch64/feat.S | 2 + crypto_sign/dilithium5/aarch64/fips202x2.c | 46 ---------------------- 15 files changed, 12 insertions(+), 288 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/cbd.c b/crypto_kem/kyber1024/aarch64/cbd.c index a96d0516..c0e9fe18 100644 --- a/crypto_kem/kyber1024/aarch64/cbd.c +++ b/crypto_kem/kyber1024/aarch64/cbd.c @@ -180,9 +180,5 @@ void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { - #if KYBER_ETA2 == 2 neon_cbd2(r, buf); - #else -#error "This implementation requires eta2 = 2" - #endif } diff --git a/crypto_kem/kyber1024/aarch64/feat.S b/crypto_kem/kyber1024/aarch64/feat.S index f467fa80..6c8e60be 100644 --- a/crypto_kem/kyber1024/aarch64/feat.S +++ b/crypto_kem/kyber1024/aarch64/feat.S @@ -123,7 +123,9 @@ SOFTWARE. .endm .align 4 +.global f1600x2 .global _f1600x2 +f1600x2: _f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! diff --git a/crypto_kem/kyber1024/aarch64/fips202x2.c b/crypto_kem/kyber1024/aarch64/fips202x2.c index e045ee3d..c8ebcd36 100644 --- a/crypto_kem/kyber1024/aarch64/fips202x2.c +++ b/crypto_kem/kyber1024/aarch64/fips202x2.c @@ -37,10 +37,6 @@ #include #include "fips202x2.h" -#ifdef PROFILE_HASHING -#include "hal.h" -extern unsigned long long hash_cycles; -#endif #define NROUNDS 24 @@ -557,14 +553,7 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -583,14 +572,7 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -607,14 +589,7 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -633,14 +608,7 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -659,9 +627,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -681,10 +646,6 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -703,9 +664,6 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -725,8 +683,4 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } diff --git a/crypto_kem/kyber512/aarch64/cbd.c b/crypto_kem/kyber512/aarch64/cbd.c index a96d0516..c0e9fe18 100644 --- a/crypto_kem/kyber512/aarch64/cbd.c +++ b/crypto_kem/kyber512/aarch64/cbd.c @@ -180,9 +180,5 @@ void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { - #if KYBER_ETA2 == 2 neon_cbd2(r, buf); - #else -#error "This implementation requires eta2 = 2" - #endif } diff --git a/crypto_kem/kyber512/aarch64/feat.S b/crypto_kem/kyber512/aarch64/feat.S index f467fa80..6c8e60be 100644 --- a/crypto_kem/kyber512/aarch64/feat.S +++ b/crypto_kem/kyber512/aarch64/feat.S @@ -123,7 +123,9 @@ SOFTWARE. .endm .align 4 +.global f1600x2 .global _f1600x2 +f1600x2: _f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! diff --git a/crypto_kem/kyber512/aarch64/fips202x2.c b/crypto_kem/kyber512/aarch64/fips202x2.c index e045ee3d..c8ebcd36 100644 --- a/crypto_kem/kyber512/aarch64/fips202x2.c +++ b/crypto_kem/kyber512/aarch64/fips202x2.c @@ -37,10 +37,6 @@ #include #include "fips202x2.h" -#ifdef PROFILE_HASHING -#include "hal.h" -extern unsigned long long hash_cycles; -#endif #define NROUNDS 24 @@ -557,14 +553,7 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -583,14 +572,7 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -607,14 +589,7 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -633,14 +608,7 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -659,9 +627,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -681,10 +646,6 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -703,9 +664,6 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -725,8 +683,4 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } diff --git a/crypto_kem/kyber768/aarch64/cbd.c b/crypto_kem/kyber768/aarch64/cbd.c index a96d0516..c0e9fe18 100644 --- a/crypto_kem/kyber768/aarch64/cbd.c +++ b/crypto_kem/kyber768/aarch64/cbd.c @@ -180,9 +180,5 @@ void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { - #if KYBER_ETA2 == 2 neon_cbd2(r, buf); - #else -#error "This implementation requires eta2 = 2" - #endif } diff --git a/crypto_kem/kyber768/aarch64/feat.S b/crypto_kem/kyber768/aarch64/feat.S index f467fa80..6c8e60be 100644 --- a/crypto_kem/kyber768/aarch64/feat.S +++ b/crypto_kem/kyber768/aarch64/feat.S @@ -123,7 +123,9 @@ SOFTWARE. .endm .align 4 +.global f1600x2 .global _f1600x2 +f1600x2: _f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! diff --git a/crypto_kem/kyber768/aarch64/fips202x2.c b/crypto_kem/kyber768/aarch64/fips202x2.c index e045ee3d..c8ebcd36 100644 --- a/crypto_kem/kyber768/aarch64/fips202x2.c +++ b/crypto_kem/kyber768/aarch64/fips202x2.c @@ -37,10 +37,6 @@ #include #include "fips202x2.h" -#ifdef PROFILE_HASHING -#include "hal.h" -extern unsigned long long hash_cycles; -#endif #define NROUNDS 24 @@ -557,14 +553,7 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -583,14 +572,7 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -607,14 +589,7 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -633,14 +608,7 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -659,9 +627,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -681,10 +646,6 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -703,9 +664,6 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -725,8 +683,4 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } diff --git a/crypto_sign/dilithium2/aarch64/feat.S b/crypto_sign/dilithium2/aarch64/feat.S index f467fa80..6c8e60be 100644 --- a/crypto_sign/dilithium2/aarch64/feat.S +++ b/crypto_sign/dilithium2/aarch64/feat.S @@ -123,7 +123,9 @@ SOFTWARE. .endm .align 4 +.global f1600x2 .global _f1600x2 +f1600x2: _f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! diff --git a/crypto_sign/dilithium2/aarch64/fips202x2.c b/crypto_sign/dilithium2/aarch64/fips202x2.c index e045ee3d..c8ebcd36 100644 --- a/crypto_sign/dilithium2/aarch64/fips202x2.c +++ b/crypto_sign/dilithium2/aarch64/fips202x2.c @@ -37,10 +37,6 @@ #include #include "fips202x2.h" -#ifdef PROFILE_HASHING -#include "hal.h" -extern unsigned long long hash_cycles; -#endif #define NROUNDS 24 @@ -557,14 +553,7 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -583,14 +572,7 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -607,14 +589,7 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -633,14 +608,7 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -659,9 +627,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -681,10 +646,6 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -703,9 +664,6 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -725,8 +683,4 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } diff --git a/crypto_sign/dilithium3/aarch64/feat.S b/crypto_sign/dilithium3/aarch64/feat.S index f467fa80..6c8e60be 100644 --- a/crypto_sign/dilithium3/aarch64/feat.S +++ b/crypto_sign/dilithium3/aarch64/feat.S @@ -123,7 +123,9 @@ SOFTWARE. .endm .align 4 +.global f1600x2 .global _f1600x2 +f1600x2: _f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! diff --git a/crypto_sign/dilithium3/aarch64/fips202x2.c b/crypto_sign/dilithium3/aarch64/fips202x2.c index e045ee3d..c8ebcd36 100644 --- a/crypto_sign/dilithium3/aarch64/fips202x2.c +++ b/crypto_sign/dilithium3/aarch64/fips202x2.c @@ -37,10 +37,6 @@ #include #include "fips202x2.h" -#ifdef PROFILE_HASHING -#include "hal.h" -extern unsigned long long hash_cycles; -#endif #define NROUNDS 24 @@ -557,14 +553,7 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -583,14 +572,7 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -607,14 +589,7 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -633,14 +608,7 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -659,9 +627,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -681,10 +646,6 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -703,9 +664,6 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -725,8 +683,4 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } diff --git a/crypto_sign/dilithium5/aarch64/feat.S b/crypto_sign/dilithium5/aarch64/feat.S index f467fa80..6c8e60be 100644 --- a/crypto_sign/dilithium5/aarch64/feat.S +++ b/crypto_sign/dilithium5/aarch64/feat.S @@ -123,7 +123,9 @@ SOFTWARE. .endm .align 4 +.global f1600x2 .global _f1600x2 +f1600x2: _f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! diff --git a/crypto_sign/dilithium5/aarch64/fips202x2.c b/crypto_sign/dilithium5/aarch64/fips202x2.c index e045ee3d..c8ebcd36 100644 --- a/crypto_sign/dilithium5/aarch64/fips202x2.c +++ b/crypto_sign/dilithium5/aarch64/fips202x2.c @@ -37,10 +37,6 @@ #include #include "fips202x2.h" -#ifdef PROFILE_HASHING -#include "hal.h" -extern unsigned long long hash_cycles; -#endif #define NROUNDS 24 @@ -557,14 +553,7 @@ void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -583,14 +572,7 @@ void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -607,14 +589,7 @@ void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -633,14 +608,7 @@ void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -659,9 +627,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[2][SHAKE128_RATE]; @@ -681,10 +646,6 @@ void shake128x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } /************************************************* @@ -703,9 +664,6 @@ void shake256x2(uint8_t *out0, const uint8_t *in0, const uint8_t *in1, size_t inlen) { - #ifdef PROFILE_HASHING - uint64_t t0 = hal_get_time(); - #endif unsigned int i; size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[2][SHAKE256_RATE]; @@ -725,8 +683,4 @@ void shake256x2(uint8_t *out0, out1[i] = t[1][i]; } } - #ifdef PROFILE_HASHING - uint64_t t1 = hal_get_time(); - hash_cycles += (t1 - t0); - #endif } From 6a9e91cea97ff49c921f16195792d45138195852 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 17:13:28 +0100 Subject: [PATCH 63/85] more namespacing --- crypto_kem/kyber1024/aarch64/NTT_params.h | 20 ++++++++++---------- crypto_kem/kyber1024/aarch64/cbd.h | 4 ++-- crypto_kem/kyber1024/aarch64/indcpa.h | 4 ++-- crypto_kem/kyber1024/aarch64/kem.h | 4 ++-- crypto_kem/kyber1024/aarch64/ntt.h | 4 ++-- crypto_kem/kyber1024/aarch64/poly.h | 4 ++-- crypto_kem/kyber1024/aarch64/polyvec.h | 4 ++-- crypto_kem/kyber1024/aarch64/reduce.h | 4 ++-- crypto_kem/kyber1024/aarch64/rejsample.h | 4 ++-- crypto_kem/kyber1024/aarch64/symmetric.h | 4 ++-- crypto_kem/kyber1024/aarch64/verify.h | 4 ++-- crypto_kem/kyber512/aarch64/NTT_params.h | 20 ++++++++++---------- crypto_kem/kyber512/aarch64/api.h | 4 ++-- crypto_kem/kyber512/aarch64/cbd.h | 4 ++-- crypto_kem/kyber512/aarch64/indcpa.h | 4 ++-- crypto_kem/kyber512/aarch64/kem.h | 4 ++-- crypto_kem/kyber512/aarch64/ntt.h | 4 ++-- crypto_kem/kyber512/aarch64/params.h | 4 ++-- crypto_kem/kyber512/aarch64/poly.h | 4 ++-- crypto_kem/kyber512/aarch64/polyvec.h | 4 ++-- crypto_kem/kyber512/aarch64/reduce.h | 4 ++-- crypto_kem/kyber512/aarch64/rejsample.h | 4 ++-- crypto_kem/kyber512/aarch64/symmetric.h | 4 ++-- crypto_kem/kyber512/aarch64/verify.h | 4 ++-- crypto_kem/kyber768/aarch64/NTT_params.h | 20 ++++++++++---------- crypto_kem/kyber768/aarch64/cbd.h | 4 ++-- crypto_kem/kyber768/aarch64/indcpa.h | 4 ++-- crypto_kem/kyber768/aarch64/kem.h | 4 ++-- crypto_kem/kyber768/aarch64/ntt.h | 4 ++-- crypto_kem/kyber768/aarch64/poly.h | 4 ++-- crypto_kem/kyber768/aarch64/polyvec.h | 4 ++-- crypto_kem/kyber768/aarch64/reduce.h | 4 ++-- crypto_kem/kyber768/aarch64/rejsample.h | 4 ++-- crypto_kem/kyber768/aarch64/symmetric.h | 4 ++-- crypto_kem/kyber768/aarch64/verify.h | 4 ++-- crypto_sign/dilithium2/aarch64/NTT_params.h | 14 +++++++------- crypto_sign/dilithium2/aarch64/ntt.h | 4 ++-- crypto_sign/dilithium2/aarch64/packing.h | 4 ++-- crypto_sign/dilithium2/aarch64/poly.h | 4 ++-- crypto_sign/dilithium2/aarch64/polyvec.h | 4 ++-- crypto_sign/dilithium2/aarch64/reduce.h | 4 ++-- crypto_sign/dilithium2/aarch64/rounding.h | 4 ++-- crypto_sign/dilithium2/aarch64/sign.h | 4 ++-- crypto_sign/dilithium2/aarch64/symmetric.h | 4 ++-- crypto_sign/dilithium3/aarch64/NTT_params.h | 14 +++++++------- crypto_sign/dilithium3/aarch64/api.h | 4 ++-- crypto_sign/dilithium3/aarch64/ntt.h | 4 ++-- crypto_sign/dilithium3/aarch64/packing.h | 4 ++-- crypto_sign/dilithium3/aarch64/params.h | 4 ++-- crypto_sign/dilithium3/aarch64/poly.h | 4 ++-- crypto_sign/dilithium3/aarch64/polyvec.h | 4 ++-- crypto_sign/dilithium3/aarch64/reduce.h | 4 ++-- crypto_sign/dilithium3/aarch64/rounding.h | 4 ++-- crypto_sign/dilithium3/aarch64/sign.h | 4 ++-- crypto_sign/dilithium3/aarch64/symmetric.h | 4 ++-- crypto_sign/dilithium5/aarch64/NTT_params.h | 14 +++++++------- crypto_sign/dilithium5/aarch64/ntt.h | 4 ++-- crypto_sign/dilithium5/aarch64/packing.h | 4 ++-- crypto_sign/dilithium5/aarch64/poly.h | 4 ++-- crypto_sign/dilithium5/aarch64/polyvec.h | 4 ++-- crypto_sign/dilithium5/aarch64/reduce.h | 4 ++-- crypto_sign/dilithium5/aarch64/rounding.h | 4 ++-- crypto_sign/dilithium5/aarch64/sign.h | 4 ++-- crypto_sign/dilithium5/aarch64/symmetric.h | 4 ++-- 64 files changed, 167 insertions(+), 167 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/NTT_params.h b/crypto_kem/kyber1024/aarch64/NTT_params.h index f2607092..9bf56093 100644 --- a/crypto_kem/kyber1024/aarch64/NTT_params.h +++ b/crypto_kem/kyber1024/aarch64/NTT_params.h @@ -1,5 +1,5 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_KYBER1024_AARCH64_NTT_PARAMS_H +#define PQCLEAN_KYBER1024_AARCH64_NTT_PARAMS_H /* * We offer @@ -42,27 +42,27 @@ #define invomegaQ1 1175 // R = 2^15 below // RmodQ1 = 2^15 mod^{+-} Q1 -#define RmodQ1 -522 +#define RmodQ1 (-522) // Q1prime = Q1^{-1} mod^{+-} 2^15 -#define Q1prime -3327 +#define Q1prime (-3327) // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 3303 // R2modQ1 = 2^16 mod^{+-} Q1 -#define R2modQ1 -1044 +#define R2modQ1 (-1044) // Q1prime2 = -Q1^{-1} mod^{+-} 2^16 #define Q1prime2 3327 // R3modQ1 = -2^32 mod^{+-} Q1 -#define R3modQ1 -1353 +#define R3modQ1 (-1353) // R3modQ1_prime = (R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime -20552 +#define R3modQ1_prime (-20552) // R3modQ1_prime_half = ( (R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime_half -10276 +#define R3modQ1_prime_half (-10276) // R3modQ1_doubleprime (R3modQ1_prime Q1 - (R3modQ1 + Q1)) / 2^16 -#define R3modQ1_doubleprime -1044 +#define R3modQ1_doubleprime (-1044) // invNQ1_R3modQ1 = -NTT_N^{-1} 2^32 mod^{+-} Q1 -#define invNQ1_R3modQ1 -1441 +#define invNQ1_R3modQ1 (-1441) // invNQ1_R3modQ1_prime = (invNQ1_R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 #define invNQ1_R3modQ1_prime 10080 // invNQ1_R3modQ1_prime_half = ( (invNQ1_R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 diff --git a/crypto_kem/kyber1024/aarch64/cbd.h b/crypto_kem/kyber1024/aarch64/cbd.h index 688abf43..ca8ae2b9 100644 --- a/crypto_kem/kyber1024/aarch64/cbd.h +++ b/crypto_kem/kyber1024/aarch64/cbd.h @@ -1,5 +1,5 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER1024_AARCH64_CBD_H +#define PQCLEAN_KYBER1024_AARCH64_CBD_H /* * This file is licensed diff --git a/crypto_kem/kyber1024/aarch64/indcpa.h b/crypto_kem/kyber1024/aarch64/indcpa.h index 30608327..2e4b46d4 100644 --- a/crypto_kem/kyber1024/aarch64/indcpa.h +++ b/crypto_kem/kyber1024/aarch64/indcpa.h @@ -1,5 +1,5 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER1024_AARCH64_INDCPA_H +#define PQCLEAN_KYBER1024_AARCH64_INDCPA_H /* * This file is licensed diff --git a/crypto_kem/kyber1024/aarch64/kem.h b/crypto_kem/kyber1024/aarch64/kem.h index afb78598..d542de09 100644 --- a/crypto_kem/kyber1024/aarch64/kem.h +++ b/crypto_kem/kyber1024/aarch64/kem.h @@ -1,5 +1,5 @@ -#ifndef KEM_H -#define KEM_H +#ifndef PQCLEAN_KYBER1024_AARCH64_KEM_H +#define PQCLEAN_KYBER1024_AARCH64_KEM_H /* * This file is licensed diff --git a/crypto_kem/kyber1024/aarch64/ntt.h b/crypto_kem/kyber1024/aarch64/ntt.h index 4e29cfda..5a18158c 100644 --- a/crypto_kem/kyber1024/aarch64/ntt.h +++ b/crypto_kem/kyber1024/aarch64/ntt.h @@ -1,5 +1,5 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER1024_AARCH64_NTT_H +#define PQCLEAN_KYBER1024_AARCH64_NTT_H /* * We offer diff --git a/crypto_kem/kyber1024/aarch64/poly.h b/crypto_kem/kyber1024/aarch64/poly.h index ae6bf04d..2882f11f 100644 --- a/crypto_kem/kyber1024/aarch64/poly.h +++ b/crypto_kem/kyber1024/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER1024_AARCH64_POLY_H +#define PQCLEAN_KYBER1024_AARCH64_POLY_H /* * This file is licensed diff --git a/crypto_kem/kyber1024/aarch64/polyvec.h b/crypto_kem/kyber1024/aarch64/polyvec.h index 69e7db9c..835db292 100644 --- a/crypto_kem/kyber1024/aarch64/polyvec.h +++ b/crypto_kem/kyber1024/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER1024_AARCH64_POLYVEC_H +#define PQCLEAN_KYBER1024_AARCH64_POLYVEC_H /* * This file was originally licensed diff --git a/crypto_kem/kyber1024/aarch64/reduce.h b/crypto_kem/kyber1024/aarch64/reduce.h index 4a7c3426..ee362244 100644 --- a/crypto_kem/kyber1024/aarch64/reduce.h +++ b/crypto_kem/kyber1024/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER1024_AARCH64_REDUCE_H +#define PQCLEAN_KYBER1024_AARCH64_REDUCE_H /* * This file is licensed diff --git a/crypto_kem/kyber1024/aarch64/rejsample.h b/crypto_kem/kyber1024/aarch64/rejsample.h index 7a9fb471..40d8dce6 100644 --- a/crypto_kem/kyber1024/aarch64/rejsample.h +++ b/crypto_kem/kyber1024/aarch64/rejsample.h @@ -1,5 +1,5 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H +#ifndef PQCLEAN_KYBER1024_AARCH64_REJSAMPLE_H +#define PQCLEAN_KYBER1024_AARCH64_REJSAMPLE_H /* * This file is licensed diff --git a/crypto_kem/kyber1024/aarch64/symmetric.h b/crypto_kem/kyber1024/aarch64/symmetric.h index 2a59b8b8..0c2dd991 100644 --- a/crypto_kem/kyber1024/aarch64/symmetric.h +++ b/crypto_kem/kyber1024/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_KYBER1024_AARCH64_SYMMETRIC_H +#define PQCLEAN_KYBER1024_AARCH64_SYMMETRIC_H /* * This file is licensed diff --git a/crypto_kem/kyber1024/aarch64/verify.h b/crypto_kem/kyber1024/aarch64/verify.h index ac78bc35..4819e0db 100644 --- a/crypto_kem/kyber1024/aarch64/verify.h +++ b/crypto_kem/kyber1024/aarch64/verify.h @@ -1,5 +1,5 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER1024_AARCH64_VERIFY_H +#define PQCLEAN_KYBER1024_AARCH64_VERIFY_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/NTT_params.h b/crypto_kem/kyber512/aarch64/NTT_params.h index f2607092..ccde0122 100644 --- a/crypto_kem/kyber512/aarch64/NTT_params.h +++ b/crypto_kem/kyber512/aarch64/NTT_params.h @@ -1,5 +1,5 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_KYBER512_AARCH64_NTT_PARAMS_H +#define PQCLEAN_KYBER512_AARCH64_NTT_PARAMS_H /* * We offer @@ -42,27 +42,27 @@ #define invomegaQ1 1175 // R = 2^15 below // RmodQ1 = 2^15 mod^{+-} Q1 -#define RmodQ1 -522 +#define RmodQ1 (-522) // Q1prime = Q1^{-1} mod^{+-} 2^15 -#define Q1prime -3327 +#define Q1prime (-3327) // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 3303 // R2modQ1 = 2^16 mod^{+-} Q1 -#define R2modQ1 -1044 +#define R2modQ1 (-1044) // Q1prime2 = -Q1^{-1} mod^{+-} 2^16 #define Q1prime2 3327 // R3modQ1 = -2^32 mod^{+-} Q1 -#define R3modQ1 -1353 +#define R3modQ1 (-1353) // R3modQ1_prime = (R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime -20552 +#define R3modQ1_prime (-20552) // R3modQ1_prime_half = ( (R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime_half -10276 +#define R3modQ1_prime_half (-10276) // R3modQ1_doubleprime (R3modQ1_prime Q1 - (R3modQ1 + Q1)) / 2^16 -#define R3modQ1_doubleprime -1044 +#define R3modQ1_doubleprime (-1044) // invNQ1_R3modQ1 = -NTT_N^{-1} 2^32 mod^{+-} Q1 -#define invNQ1_R3modQ1 -1441 +#define invNQ1_R3modQ1 (-1441) // invNQ1_R3modQ1_prime = (invNQ1_R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 #define invNQ1_R3modQ1_prime 10080 // invNQ1_R3modQ1_prime_half = ( (invNQ1_R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 diff --git a/crypto_kem/kyber512/aarch64/api.h b/crypto_kem/kyber512/aarch64/api.h index 97d81a11..36c00f70 100644 --- a/crypto_kem/kyber512/aarch64/api.h +++ b/crypto_kem/kyber512/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef API_H -#define API_H +#ifndef PQCLEAN_KYBER512_AARCH64_API_H +#define PQCLEAN_KYBER512_AARCH64_API_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/cbd.h b/crypto_kem/kyber512/aarch64/cbd.h index 688abf43..2b3eb2af 100644 --- a/crypto_kem/kyber512/aarch64/cbd.h +++ b/crypto_kem/kyber512/aarch64/cbd.h @@ -1,5 +1,5 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER512_AARCH64_CBD_H +#define PQCLEAN_KYBER512_AARCH64_CBD_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/indcpa.h b/crypto_kem/kyber512/aarch64/indcpa.h index 30608327..0fdac5e2 100644 --- a/crypto_kem/kyber512/aarch64/indcpa.h +++ b/crypto_kem/kyber512/aarch64/indcpa.h @@ -1,5 +1,5 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER512_AARCH64_INDCPA_H +#define PQCLEAN_KYBER512_AARCH64_INDCPA_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/kem.h b/crypto_kem/kyber512/aarch64/kem.h index afb78598..8702ac47 100644 --- a/crypto_kem/kyber512/aarch64/kem.h +++ b/crypto_kem/kyber512/aarch64/kem.h @@ -1,5 +1,5 @@ -#ifndef KEM_H -#define KEM_H +#ifndef PQCLEAN_KYBER512_AARCH64_KEM_H +#define PQCLEAN_KYBER512_AARCH64_KEM_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/ntt.h b/crypto_kem/kyber512/aarch64/ntt.h index aceddc54..141fa225 100644 --- a/crypto_kem/kyber512/aarch64/ntt.h +++ b/crypto_kem/kyber512/aarch64/ntt.h @@ -1,5 +1,5 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER512_AARCH64_NTT_H +#define PQCLEAN_KYBER512_AARCH64_NTT_H /* * We offer diff --git a/crypto_kem/kyber512/aarch64/params.h b/crypto_kem/kyber512/aarch64/params.h index 2b741df9..f4ddb3a5 100644 --- a/crypto_kem/kyber512/aarch64/params.h +++ b/crypto_kem/kyber512/aarch64/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_KYBER512_AARCH64_PARAMS_H +#define PQCLEAN_KYBER512_AARCH64_PARAMS_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/poly.h b/crypto_kem/kyber512/aarch64/poly.h index ae6bf04d..6ba67e59 100644 --- a/crypto_kem/kyber512/aarch64/poly.h +++ b/crypto_kem/kyber512/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER512_AARCH64_POLY_H +#define PQCLEAN_KYBER512_AARCH64_POLY_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/polyvec.h b/crypto_kem/kyber512/aarch64/polyvec.h index 69e7db9c..3ff4c2e2 100644 --- a/crypto_kem/kyber512/aarch64/polyvec.h +++ b/crypto_kem/kyber512/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER512_AARCH64_POLYVEC_H +#define PQCLEAN_KYBER512_AARCH64_POLYVEC_H /* * This file was originally licensed diff --git a/crypto_kem/kyber512/aarch64/reduce.h b/crypto_kem/kyber512/aarch64/reduce.h index 4a7c3426..c093e84a 100644 --- a/crypto_kem/kyber512/aarch64/reduce.h +++ b/crypto_kem/kyber512/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER512_AARCH64_REDUCE_H +#define PQCLEAN_KYBER512_AARCH64_REDUCE_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/rejsample.h b/crypto_kem/kyber512/aarch64/rejsample.h index 7a9fb471..2442e01d 100644 --- a/crypto_kem/kyber512/aarch64/rejsample.h +++ b/crypto_kem/kyber512/aarch64/rejsample.h @@ -1,5 +1,5 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H +#ifndef PQCLEAN_KYBER512_AARCH64_REJSAMPLE_H +#define PQCLEAN_KYBER512_AARCH64_REJSAMPLE_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/symmetric.h b/crypto_kem/kyber512/aarch64/symmetric.h index 2a59b8b8..3c4c5074 100644 --- a/crypto_kem/kyber512/aarch64/symmetric.h +++ b/crypto_kem/kyber512/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_KYBER512_AARCH64_SYMMETRIC_H +#define PQCLEAN_KYBER512_AARCH64_SYMMETRIC_H /* * This file is licensed diff --git a/crypto_kem/kyber512/aarch64/verify.h b/crypto_kem/kyber512/aarch64/verify.h index ac78bc35..81b6525d 100644 --- a/crypto_kem/kyber512/aarch64/verify.h +++ b/crypto_kem/kyber512/aarch64/verify.h @@ -1,5 +1,5 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER512_AARCH64_VERIFY_H +#define PQCLEAN_KYBER512_AARCH64_VERIFY_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/NTT_params.h b/crypto_kem/kyber768/aarch64/NTT_params.h index f2607092..0d47ff96 100644 --- a/crypto_kem/kyber768/aarch64/NTT_params.h +++ b/crypto_kem/kyber768/aarch64/NTT_params.h @@ -1,5 +1,5 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_KYBER768_AARCH64_NTT_PARAMS_H +#define PQCLEAN_KYBER768_AARCH64_NTT_PARAMS_H /* * We offer @@ -42,27 +42,27 @@ #define invomegaQ1 1175 // R = 2^15 below // RmodQ1 = 2^15 mod^{+-} Q1 -#define RmodQ1 -522 +#define RmodQ1 (-522) // Q1prime = Q1^{-1} mod^{+-} 2^15 -#define Q1prime -3327 +#define Q1prime (-3327) // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 3303 // R2modQ1 = 2^16 mod^{+-} Q1 -#define R2modQ1 -1044 +#define R2modQ1 (-1044) // Q1prime2 = -Q1^{-1} mod^{+-} 2^16 #define Q1prime2 3327 // R3modQ1 = -2^32 mod^{+-} Q1 -#define R3modQ1 -1353 +#define R3modQ1 (-1353) // R3modQ1_prime = (R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime -20552 +#define R3modQ1_prime (-20552) // R3modQ1_prime_half = ( (R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 -#define R3modQ1_prime_half -10276 +#define R3modQ1_prime_half (-10276) // R3modQ1_doubleprime (R3modQ1_prime Q1 - (R3modQ1 + Q1)) / 2^16 -#define R3modQ1_doubleprime -1044 +#define R3modQ1_doubleprime (-1044) // invNQ1_R3modQ1 = -NTT_N^{-1} 2^32 mod^{+-} Q1 -#define invNQ1_R3modQ1 -1441 +#define invNQ1_R3modQ1 (-1441) // invNQ1_R3modQ1_prime = (invNQ1_R3modQ1 + Q1) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 #define invNQ1_R3modQ1_prime 10080 // invNQ1_R3modQ1_prime_half = ( (invNQ1_R3modQ1 + Q1) / 2) (Q1^{-1} mod^{+-} 2^16) mod^{+-} 2^16 diff --git a/crypto_kem/kyber768/aarch64/cbd.h b/crypto_kem/kyber768/aarch64/cbd.h index 688abf43..5389023f 100644 --- a/crypto_kem/kyber768/aarch64/cbd.h +++ b/crypto_kem/kyber768/aarch64/cbd.h @@ -1,5 +1,5 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER768_AARCH64_CBD_H +#define PQCLEAN_KYBER768_AARCH64_CBD_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/indcpa.h b/crypto_kem/kyber768/aarch64/indcpa.h index 30608327..313888c0 100644 --- a/crypto_kem/kyber768/aarch64/indcpa.h +++ b/crypto_kem/kyber768/aarch64/indcpa.h @@ -1,5 +1,5 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER768_AARCH64_INDCPA_H +#define PQCLEAN_KYBER768_AARCH64_INDCPA_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/kem.h b/crypto_kem/kyber768/aarch64/kem.h index afb78598..fb8a731d 100644 --- a/crypto_kem/kyber768/aarch64/kem.h +++ b/crypto_kem/kyber768/aarch64/kem.h @@ -1,5 +1,5 @@ -#ifndef KEM_H -#define KEM_H +#ifndef PQCLEAN_KYBER768_AARCH64_KEM_H +#define PQCLEAN_KYBER768_AARCH64_KEM_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/ntt.h b/crypto_kem/kyber768/aarch64/ntt.h index 3ed9cdcf..cd5fd984 100644 --- a/crypto_kem/kyber768/aarch64/ntt.h +++ b/crypto_kem/kyber768/aarch64/ntt.h @@ -1,5 +1,5 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER768_AARCH64_NTT_H +#define PQCLEAN_KYBER768_AARCH64_NTT_H /* * We offer diff --git a/crypto_kem/kyber768/aarch64/poly.h b/crypto_kem/kyber768/aarch64/poly.h index ae6bf04d..2af01f78 100644 --- a/crypto_kem/kyber768/aarch64/poly.h +++ b/crypto_kem/kyber768/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER768_AARCH64_POLY_H +#define PQCLEAN_KYBER768_AARCH64_POLY_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/polyvec.h b/crypto_kem/kyber768/aarch64/polyvec.h index 69e7db9c..97dcf23c 100644 --- a/crypto_kem/kyber768/aarch64/polyvec.h +++ b/crypto_kem/kyber768/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER768_AARCH64_POLYVEC_H +#define PQCLEAN_KYBER768_AARCH64_POLYVEC_H /* * This file was originally licensed diff --git a/crypto_kem/kyber768/aarch64/reduce.h b/crypto_kem/kyber768/aarch64/reduce.h index 4a7c3426..e16a894b 100644 --- a/crypto_kem/kyber768/aarch64/reduce.h +++ b/crypto_kem/kyber768/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER768_AARCH64_REDUCE_H +#define PQCLEAN_KYBER768_AARCH64_REDUCE_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/rejsample.h b/crypto_kem/kyber768/aarch64/rejsample.h index 7a9fb471..87674209 100644 --- a/crypto_kem/kyber768/aarch64/rejsample.h +++ b/crypto_kem/kyber768/aarch64/rejsample.h @@ -1,5 +1,5 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H +#ifndef PQCLEAN_KYBER768_AARCH64_REJSAMPLE_H +#define PQCLEAN_KYBER768_AARCH64_REJSAMPLE_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/symmetric.h b/crypto_kem/kyber768/aarch64/symmetric.h index 2a59b8b8..336fe4da 100644 --- a/crypto_kem/kyber768/aarch64/symmetric.h +++ b/crypto_kem/kyber768/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_KYBER768_AARCH64_SYMMETRIC_H +#define PQCLEAN_KYBER768_AARCH64_SYMMETRIC_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/verify.h b/crypto_kem/kyber768/aarch64/verify.h index ac78bc35..a52767b3 100644 --- a/crypto_kem/kyber768/aarch64/verify.h +++ b/crypto_kem/kyber768/aarch64/verify.h @@ -1,5 +1,5 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER768_AARCH64_VERIFY_H +#define PQCLEAN_KYBER768_AARCH64_VERIFY_H /* * This file is licensed diff --git a/crypto_sign/dilithium2/aarch64/NTT_params.h b/crypto_sign/dilithium2/aarch64/NTT_params.h index dc261a2d..72ce624d 100644 --- a/crypto_sign/dilithium2/aarch64/NTT_params.h +++ b/crypto_sign/dilithium2/aarch64/NTT_params.h @@ -1,5 +1,5 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_NTT_PARAMS_H +#define PQCLEAN_DILITHIUM2_AARCH64_NTT_PARAMS_H /* * We offer @@ -44,14 +44,14 @@ #define invomegaQ1 731434 // R = 2^32 below // RmodQ1 = 2^32 mod^{+-} Q1 -#define RmodQ1 -4186625 +#define RmodQ1 (-4186625) // Q1prime = Q1^{-1} mod^{+-} 2^32 #define Q1prime 58728449 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 8347681 // invNQ1R2modQ1 = -NTT_N^{-1} 2^32 2^32 mod^{+-} Q1 below -#define invNQ1R2modQ1 -41978 +#define invNQ1R2modQ1 (-41978) // invNQ1R2modQ1_prime = invNQ1R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 #define invNQ1R2modQ1_prime 8395782 // invNQ1R2modQ1_prime_half = (invNQ1R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 @@ -62,11 +62,11 @@ // invNQ1_final_R2modQ1 = -invNQ1R2modQ1 invomegaQ1^{128} mod q #define invNQ1_final_R2modQ1 4404704 // invNQ1_final_R2modQ1_prime = invNQ1_final_R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime -151046688 +#define invNQ1_final_R2modQ1_prime (-151046688) // invNQ1_final_R2modQ1_prime_half = (invNQ1_final_R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime_half -75523344 +#define invNQ1_final_R2modQ1_prime_half (-75523344) // invNQ1_final_R2modQ1_doubleprime = (invNQ1_final_R2modQ1_prime Q1 - invNQ1_final_R2modQ1) / 2^32 -#define invNQ1_final_R2modQ1_doubleprime -294725 +#define invNQ1_final_R2modQ1_doubleprime (-294725) // RmodQ1_prime = -(RmodQ1 + Q1) Q1prime mod^{+-} 2^32 #define RmodQ1_prime 512 diff --git a/crypto_sign/dilithium2/aarch64/ntt.h b/crypto_sign/dilithium2/aarch64/ntt.h index b26b6479..497330a7 100644 --- a/crypto_sign/dilithium2/aarch64/ntt.h +++ b/crypto_sign/dilithium2/aarch64/ntt.h @@ -1,5 +1,5 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_NTT_H +#define PQCLEAN_DILITHIUM2_AARCH64_NTT_H /* * This file was originally licensed diff --git a/crypto_sign/dilithium2/aarch64/packing.h b/crypto_sign/dilithium2/aarch64/packing.h index 9021a864..162b4be3 100644 --- a/crypto_sign/dilithium2/aarch64/packing.h +++ b/crypto_sign/dilithium2/aarch64/packing.h @@ -1,5 +1,5 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_PACKING_H +#define PQCLEAN_DILITHIUM2_AARCH64_PACKING_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium2/aarch64/poly.h b/crypto_sign/dilithium2/aarch64/poly.h index c253ecf6..a6a8936d 100644 --- a/crypto_sign/dilithium2/aarch64/poly.h +++ b/crypto_sign/dilithium2/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_POLY_H +#define PQCLEAN_DILITHIUM2_AARCH64_POLY_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium2/aarch64/polyvec.h b/crypto_sign/dilithium2/aarch64/polyvec.h index 8844ca79..8d8905bc 100644 --- a/crypto_sign/dilithium2/aarch64/polyvec.h +++ b/crypto_sign/dilithium2/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_POLYVEC_H +#define PQCLEAN_DILITHIUM2_AARCH64_POLYVEC_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium2/aarch64/reduce.h b/crypto_sign/dilithium2/aarch64/reduce.h index 9042e6cb..721feb67 100644 --- a/crypto_sign/dilithium2/aarch64/reduce.h +++ b/crypto_sign/dilithium2/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_REDUCE_H +#define PQCLEAN_DILITHIUM2_AARCH64_REDUCE_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium2/aarch64/rounding.h b/crypto_sign/dilithium2/aarch64/rounding.h index 36167d2a..f581543d 100644 --- a/crypto_sign/dilithium2/aarch64/rounding.h +++ b/crypto_sign/dilithium2/aarch64/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_ROUNDING_H +#define PQCLEAN_DILITHIUM2_AARCH64_ROUNDING_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium2/aarch64/sign.h b/crypto_sign/dilithium2/aarch64/sign.h index 05e7b5f6..1a9dbea5 100644 --- a/crypto_sign/dilithium2/aarch64/sign.h +++ b/crypto_sign/dilithium2/aarch64/sign.h @@ -1,5 +1,5 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_SIGN_H +#define PQCLEAN_DILITHIUM2_AARCH64_SIGN_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium2/aarch64/symmetric.h b/crypto_sign/dilithium2/aarch64/symmetric.h index 74d21021..81b7f1a9 100644 --- a/crypto_sign/dilithium2/aarch64/symmetric.h +++ b/crypto_sign/dilithium2/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_SYMMETRIC_H +#define PQCLEAN_DILITHIUM2_AARCH64_SYMMETRIC_H /* * This file was originally licensed diff --git a/crypto_sign/dilithium3/aarch64/NTT_params.h b/crypto_sign/dilithium3/aarch64/NTT_params.h index dc261a2d..053c3677 100644 --- a/crypto_sign/dilithium3/aarch64/NTT_params.h +++ b/crypto_sign/dilithium3/aarch64/NTT_params.h @@ -1,5 +1,5 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_NTT_PARAMS_H +#define PQCLEAN_DILITHIUM3_AARCH64_NTT_PARAMS_H /* * We offer @@ -44,14 +44,14 @@ #define invomegaQ1 731434 // R = 2^32 below // RmodQ1 = 2^32 mod^{+-} Q1 -#define RmodQ1 -4186625 +#define RmodQ1 (-4186625) // Q1prime = Q1^{-1} mod^{+-} 2^32 #define Q1prime 58728449 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 8347681 // invNQ1R2modQ1 = -NTT_N^{-1} 2^32 2^32 mod^{+-} Q1 below -#define invNQ1R2modQ1 -41978 +#define invNQ1R2modQ1 (-41978) // invNQ1R2modQ1_prime = invNQ1R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 #define invNQ1R2modQ1_prime 8395782 // invNQ1R2modQ1_prime_half = (invNQ1R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 @@ -62,11 +62,11 @@ // invNQ1_final_R2modQ1 = -invNQ1R2modQ1 invomegaQ1^{128} mod q #define invNQ1_final_R2modQ1 4404704 // invNQ1_final_R2modQ1_prime = invNQ1_final_R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime -151046688 +#define invNQ1_final_R2modQ1_prime (-151046688) // invNQ1_final_R2modQ1_prime_half = (invNQ1_final_R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime_half -75523344 +#define invNQ1_final_R2modQ1_prime_half (-75523344) // invNQ1_final_R2modQ1_doubleprime = (invNQ1_final_R2modQ1_prime Q1 - invNQ1_final_R2modQ1) / 2^32 -#define invNQ1_final_R2modQ1_doubleprime -294725 +#define invNQ1_final_R2modQ1_doubleprime (-294725) // RmodQ1_prime = -(RmodQ1 + Q1) Q1prime mod^{+-} 2^32 #define RmodQ1_prime 512 diff --git a/crypto_sign/dilithium3/aarch64/api.h b/crypto_sign/dilithium3/aarch64/api.h index 6f11665e..46832a8e 100644 --- a/crypto_sign/dilithium3/aarch64/api.h +++ b/crypto_sign/dilithium3/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef API_H -#define API_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_API_H +#define PQCLEAN_DILITHIUM3_AARCH64_API_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium3/aarch64/ntt.h b/crypto_sign/dilithium3/aarch64/ntt.h index ef8dd217..25b2d95d 100644 --- a/crypto_sign/dilithium3/aarch64/ntt.h +++ b/crypto_sign/dilithium3/aarch64/ntt.h @@ -1,5 +1,5 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_NTT_H +#define PQCLEAN_DILITHIUM3_AARCH64_NTT_H /* * This file was originally licensed diff --git a/crypto_sign/dilithium3/aarch64/packing.h b/crypto_sign/dilithium3/aarch64/packing.h index c1fba82a..de6083ce 100644 --- a/crypto_sign/dilithium3/aarch64/packing.h +++ b/crypto_sign/dilithium3/aarch64/packing.h @@ -1,5 +1,5 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_PACKING_H +#define PQCLEAN_DILITHIUM3_AARCH64_PACKING_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium3/aarch64/params.h b/crypto_sign/dilithium3/aarch64/params.h index fe19f815..646444d5 100644 --- a/crypto_sign/dilithium3/aarch64/params.h +++ b/crypto_sign/dilithium3/aarch64/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_PARAMS_H +#define PQCLEAN_DILITHIUM3_AARCH64_PARAMS_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium3/aarch64/poly.h b/crypto_sign/dilithium3/aarch64/poly.h index c253ecf6..cad1723e 100644 --- a/crypto_sign/dilithium3/aarch64/poly.h +++ b/crypto_sign/dilithium3/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_POLY_H +#define PQCLEAN_DILITHIUM3_AARCH64_POLY_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium3/aarch64/polyvec.h b/crypto_sign/dilithium3/aarch64/polyvec.h index 8844ca79..fe78217d 100644 --- a/crypto_sign/dilithium3/aarch64/polyvec.h +++ b/crypto_sign/dilithium3/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_POLYVEC_H +#define PQCLEAN_DILITHIUM3_AARCH64_POLYVEC_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium3/aarch64/reduce.h b/crypto_sign/dilithium3/aarch64/reduce.h index 9042e6cb..1abb92a4 100644 --- a/crypto_sign/dilithium3/aarch64/reduce.h +++ b/crypto_sign/dilithium3/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_REDUCE_H +#define PQCLEAN_DILITHIUM3_AARCH64_REDUCE_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium3/aarch64/rounding.h b/crypto_sign/dilithium3/aarch64/rounding.h index 36167d2a..f142c0e3 100644 --- a/crypto_sign/dilithium3/aarch64/rounding.h +++ b/crypto_sign/dilithium3/aarch64/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_ROUNDING_H +#define PQCLEAN_DILITHIUM3_AARCH64_ROUNDING_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium3/aarch64/sign.h b/crypto_sign/dilithium3/aarch64/sign.h index 05e7b5f6..97c60f3d 100644 --- a/crypto_sign/dilithium3/aarch64/sign.h +++ b/crypto_sign/dilithium3/aarch64/sign.h @@ -1,5 +1,5 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_SIGN_H +#define PQCLEAN_DILITHIUM3_AARCH64_SIGN_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium3/aarch64/symmetric.h b/crypto_sign/dilithium3/aarch64/symmetric.h index 74d21021..cf7ff128 100644 --- a/crypto_sign/dilithium3/aarch64/symmetric.h +++ b/crypto_sign/dilithium3/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_SYMMETRIC_H +#define PQCLEAN_DILITHIUM3_AARCH64_SYMMETRIC_H /* * This file was originally licensed diff --git a/crypto_sign/dilithium5/aarch64/NTT_params.h b/crypto_sign/dilithium5/aarch64/NTT_params.h index dc261a2d..b087f781 100644 --- a/crypto_sign/dilithium5/aarch64/NTT_params.h +++ b/crypto_sign/dilithium5/aarch64/NTT_params.h @@ -1,5 +1,5 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_NTT_PARAMS_H +#define PQCLEAN_DILITHIUM5_AARCH64_NTT_PARAMS_H /* * We offer @@ -44,14 +44,14 @@ #define invomegaQ1 731434 // R = 2^32 below // RmodQ1 = 2^32 mod^{+-} Q1 -#define RmodQ1 -4186625 +#define RmodQ1 (-4186625) // Q1prime = Q1^{-1} mod^{+-} 2^32 #define Q1prime 58728449 // invNQ1 = NTT_N^{-1} mod Q1 #define invNQ1 8347681 // invNQ1R2modQ1 = -NTT_N^{-1} 2^32 2^32 mod^{+-} Q1 below -#define invNQ1R2modQ1 -41978 +#define invNQ1R2modQ1 (-41978) // invNQ1R2modQ1_prime = invNQ1R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 #define invNQ1R2modQ1_prime 8395782 // invNQ1R2modQ1_prime_half = (invNQ1R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 @@ -62,11 +62,11 @@ // invNQ1_final_R2modQ1 = -invNQ1R2modQ1 invomegaQ1^{128} mod q #define invNQ1_final_R2modQ1 4404704 // invNQ1_final_R2modQ1_prime = invNQ1_final_R2modQ1 (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime -151046688 +#define invNQ1_final_R2modQ1_prime (-151046688) // invNQ1_final_R2modQ1_prime_half = (invNQ1_final_R2modQ1 / 2) (Q1^{-1} mod^{+-} 2^32) mod^{+-} 2^32 -#define invNQ1_final_R2modQ1_prime_half -75523344 +#define invNQ1_final_R2modQ1_prime_half (-75523344) // invNQ1_final_R2modQ1_doubleprime = (invNQ1_final_R2modQ1_prime Q1 - invNQ1_final_R2modQ1) / 2^32 -#define invNQ1_final_R2modQ1_doubleprime -294725 +#define invNQ1_final_R2modQ1_doubleprime (-294725) // RmodQ1_prime = -(RmodQ1 + Q1) Q1prime mod^{+-} 2^32 #define RmodQ1_prime 512 diff --git a/crypto_sign/dilithium5/aarch64/ntt.h b/crypto_sign/dilithium5/aarch64/ntt.h index 6797322b..d3c7b756 100644 --- a/crypto_sign/dilithium5/aarch64/ntt.h +++ b/crypto_sign/dilithium5/aarch64/ntt.h @@ -1,5 +1,5 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_NTT_H +#define PQCLEAN_DILITHIUM5_AARCH64_NTT_H /* * This file was originally licensed diff --git a/crypto_sign/dilithium5/aarch64/packing.h b/crypto_sign/dilithium5/aarch64/packing.h index 050dc8e6..1d2c448a 100644 --- a/crypto_sign/dilithium5/aarch64/packing.h +++ b/crypto_sign/dilithium5/aarch64/packing.h @@ -1,5 +1,5 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_PACKING_H +#define PQCLEAN_DILITHIUM5_AARCH64_PACKING_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium5/aarch64/poly.h b/crypto_sign/dilithium5/aarch64/poly.h index c253ecf6..158e3e2d 100644 --- a/crypto_sign/dilithium5/aarch64/poly.h +++ b/crypto_sign/dilithium5/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_POLY_H +#define PQCLEAN_DILITHIUM5_AARCH64_POLY_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium5/aarch64/polyvec.h b/crypto_sign/dilithium5/aarch64/polyvec.h index 8844ca79..a3130785 100644 --- a/crypto_sign/dilithium5/aarch64/polyvec.h +++ b/crypto_sign/dilithium5/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_POLYVEC_H +#define PQCLEAN_DILITHIUM5_AARCH64_POLYVEC_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium5/aarch64/reduce.h b/crypto_sign/dilithium5/aarch64/reduce.h index 9042e6cb..2be7f6a7 100644 --- a/crypto_sign/dilithium5/aarch64/reduce.h +++ b/crypto_sign/dilithium5/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_REDUCE_H +#define PQCLEAN_DILITHIUM5_AARCH64_REDUCE_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium5/aarch64/rounding.h b/crypto_sign/dilithium5/aarch64/rounding.h index 36167d2a..1f6be28b 100644 --- a/crypto_sign/dilithium5/aarch64/rounding.h +++ b/crypto_sign/dilithium5/aarch64/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_ROUNDING_H +#define PQCLEAN_DILITHIUM5_AARCH64_ROUNDING_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium5/aarch64/sign.h b/crypto_sign/dilithium5/aarch64/sign.h index 05e7b5f6..692f06d3 100644 --- a/crypto_sign/dilithium5/aarch64/sign.h +++ b/crypto_sign/dilithium5/aarch64/sign.h @@ -1,5 +1,5 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_SIGN_H +#define PQCLEAN_DILITHIUM5_AARCH64_SIGN_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium5/aarch64/symmetric.h b/crypto_sign/dilithium5/aarch64/symmetric.h index 74d21021..ecf767f7 100644 --- a/crypto_sign/dilithium5/aarch64/symmetric.h +++ b/crypto_sign/dilithium5/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_SYMMETRIC_H +#define PQCLEAN_DILITHIUM5_AARCH64_SYMMETRIC_H /* * This file was originally licensed From ae10f4e2de654b297b9ccb58a2cc75bf9f495174 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 17:16:47 +0100 Subject: [PATCH 64/85] namespaced api --- crypto_kem/kyber1024/aarch64/api.h | 4 ++-- crypto_kem/kyber768/aarch64/api.h | 4 ++-- crypto_sign/dilithium2/aarch64/api.h | 4 ++-- crypto_sign/dilithium5/aarch64/api.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/api.h b/crypto_kem/kyber1024/aarch64/api.h index 217634f3..00373e01 100644 --- a/crypto_kem/kyber1024/aarch64/api.h +++ b/crypto_kem/kyber1024/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef API_H -#define API_H +#ifndef PQCLEAN_KYBER1024_AARCH64_API_H +#define PQCLEAN_KYBER1024_AARCH64_API_H /* * This file is licensed diff --git a/crypto_kem/kyber768/aarch64/api.h b/crypto_kem/kyber768/aarch64/api.h index 39b13746..eab10900 100644 --- a/crypto_kem/kyber768/aarch64/api.h +++ b/crypto_kem/kyber768/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef API_H -#define API_H +#ifndef PQCLEAN_KYBER768_AARCH64_API_H +#define PQCLEAN_KYBER768_AARCH64_API_H /* * This file is licensed diff --git a/crypto_sign/dilithium2/aarch64/api.h b/crypto_sign/dilithium2/aarch64/api.h index 254b49e1..77b1e37d 100644 --- a/crypto_sign/dilithium2/aarch64/api.h +++ b/crypto_sign/dilithium2/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef API_H -#define API_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_API_H +#define PQCLEAN_DILITHIUM2_AARCH64_API_H /* * This file is dual licensed diff --git a/crypto_sign/dilithium5/aarch64/api.h b/crypto_sign/dilithium5/aarch64/api.h index db2cc3ab..c211dc65 100644 --- a/crypto_sign/dilithium5/aarch64/api.h +++ b/crypto_sign/dilithium5/aarch64/api.h @@ -1,5 +1,5 @@ -#ifndef API_H -#define API_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_API_H +#define PQCLEAN_DILITHIUM5_AARCH64_API_H /* * This file is dual licensed From 6023c0b71560f3f0c8ebb516bce427aa0004536a Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 17:29:54 +0100 Subject: [PATCH 65/85] update custom unifdef --- crypto_kem/kyber1024/aarch64/cbd.c | 35 ------ crypto_kem/kyber1024/aarch64/indcpa.c | 153 +----------------------- crypto_kem/kyber1024/aarch64/kem.c | 10 +- crypto_kem/kyber1024/aarch64/kem.h | 6 - crypto_kem/kyber1024/aarch64/poly.c | 29 ----- crypto_kem/kyber512/aarch64/cbd.c | 10 -- crypto_kem/kyber512/aarch64/indcpa.c | 161 +------------------------- crypto_kem/kyber512/aarch64/kem.c | 10 +- crypto_kem/kyber512/aarch64/kem.h | 6 - crypto_kem/kyber512/aarch64/poly.c | 42 ------- crypto_kem/kyber768/aarch64/cbd.c | 35 ------ crypto_kem/kyber768/aarch64/indcpa.c | 90 +------------- crypto_kem/kyber768/aarch64/kem.c | 10 +- crypto_kem/kyber768/aarch64/kem.h | 6 - crypto_kem/kyber768/aarch64/poly.c | 42 ------- 15 files changed, 21 insertions(+), 624 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/cbd.c b/crypto_kem/kyber1024/aarch64/cbd.c index c0e9fe18..6ae95c03 100644 --- a/crypto_kem/kyber1024/aarch64/cbd.c +++ b/crypto_kem/kyber1024/aarch64/cbd.c @@ -127,15 +127,6 @@ void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) { * * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) **************************************************/ -#if KYBER_ETA1 == 3 -static uint32_t load24_littleendian(const uint8_t x[3]) { - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} -#endif /************************************************* * Name: cbd3 @@ -148,35 +139,9 @@ static uint32_t load24_littleendian(const uint8_t x[3]) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -#if KYBER_ETA1 == 3 -static void cbd3(int16_t *r, const uint8_t buf[3 * KYBER_N / 4]) { - unsigned int i, j; - uint32_t t, d; - int16_t a, b; - - for (i = 0; i < KYBER_N / 4; i++) { - t = load24_littleendian(buf + 3 * i); - d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) { - a = (d >> (6 * j + 0)) & 0x7; - b = (d >> (6 * j + 3)) & 0x7; - r[4 * i + j] = a - b; - } - } -} -#endif void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { - #if KYBER_ETA1 == 2 neon_cbd2(r, buf); - #elif KYBER_ETA1 == 3 - cbd3(r, buf); - #else -#error "This implementation requires eta1 in {2,3}" - #endif } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { diff --git a/crypto_kem/kyber1024/aarch64/indcpa.c b/crypto_kem/kyber1024/aarch64/indcpa.c index 93c0f9b7..04c4ae88 100644 --- a/crypto_kem/kyber1024/aarch64/indcpa.c +++ b/crypto_kem/kyber1024/aarch64/indcpa.c @@ -162,126 +162,6 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; neon_xof_state state; - #if KYBER_K == 2 - for (unsigned int i = 0; i < KYBER_K; i++) { - if (transposed) { - neon_xof_absorb(&state, seed, i, i, 0, 1); - } else { - neon_xof_absorb(&state, seed, 0, 1, i, i); - } - - neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); - - buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; - - ctr0 = neon_rej_uniform(&(a[i][0][0]), buf0); - ctr1 = neon_rej_uniform(&(a[i][1][0]), buf1); - while (ctr0 < KYBER_N || ctr1 < KYBER_N) { - off = buflen % 3; - for (k = 0; k < off; k++) { - buf0[k] = buf0[buflen - off + k]; - buf1[k] = buf1[buflen - off + k]; - } - neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); - - buflen = off + XOF_BLOCKBYTES; - ctr0 += rej_uniform(&(a[i][0][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); - ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); - } - } - #elif KYBER_K == 3 - int16_t *s1 = NULL, *s2 = NULL; - unsigned int x1, x2, y1, y2; - xof_state c_state; - - for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2) { - switch (j) { - case 0: - s1 = &(a[0][0][0]); - s2 = &(a[0][1][0]); - x1 = 0; - y1 = 0; - x2 = 0; - y2 = 1; - break; - case 2: - s1 = &(a[0][2][0]); - s2 = &(a[1][0][0]); - x1 = 0; - y1 = 2; - x2 = 1; - y2 = 0; - break; - case 4: - s1 = &(a[1][1][0]); - s2 = &(a[1][2][0]); - x1 = 1; - y1 = 1; - x2 = 1; - y2 = 2; - break; - default: - s1 = &(a[2][0][0]); - s2 = &(a[2][1][0]); - x1 = 2; - y1 = 0; - x2 = 2; - y2 = 1; - break; - } - - if (transposed) { - neon_xof_absorb(&state, seed, x1, x2, y1, y2); - } else { - neon_xof_absorb(&state, seed, y1, y2, x1, x2); - } - - neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); - - buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; - - ctr0 = neon_rej_uniform(s1, buf0); - ctr1 = neon_rej_uniform(s2, buf1); - - while (ctr0 < KYBER_N || ctr1 < KYBER_N) { - off = buflen % 3; - for (k = 0; k < off; k++) { - buf0[k] = buf0[buflen - off + k]; - buf1[k] = buf1[buflen - off + k]; - } - neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); - - buflen = off + XOF_BLOCKBYTES; - ctr0 += rej_uniform(s1 + ctr0, KYBER_N - ctr0, buf0, buflen); - ctr1 += rej_uniform(s2 + ctr1, KYBER_N - ctr1, buf1, buflen); - } - } - - // Last iteration [2][2] - if (transposed) { - xof_absorb(&c_state, seed, 2, 2); - } else { - xof_absorb(&c_state, seed, 2, 2); - } - - xof_squeezeblocks(buf0, GEN_MATRIX_NBLOCKS, &c_state); - - buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; - - ctr0 = neon_rej_uniform(&(a[2][2][0]), buf0); - - while (ctr0 < KYBER_N) { - off = buflen % 3; - for (k = 0; k < off; k++) { - buf0[k] = buf0[buflen - off + k]; - } - xof_squeezeblocks(buf0 + off, 1, &c_state); - - buflen = off + XOF_BLOCKBYTES; - ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); - } - - #elif KYBER_K == 4 for (unsigned int i = 0; i < KYBER_K; i++) { for (unsigned int j = 0; j < KYBER_K; j += 2) { if (transposed) { @@ -309,9 +189,6 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S } } } - #else -#error "KYBER_K must be in {2,3,4}" - #endif } /************************************************* @@ -326,8 +203,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]) { + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; uint8_t buf[2 * KYBER_SYMBYTES]; const uint8_t *publicseed = buf; @@ -342,19 +219,10 @@ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], gen_a(a, publicseed); - #if KYBER_K == 2 - neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); - neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3); - #elif KYBER_K == 3 - neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); - neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(e[0][0]), noiseseed, 2, 3); - neon_poly_getnoise_eta1_2x(&(e[1][0]), &(e[2][0]), noiseseed, 4, 5); - #elif KYBER_K == 4 neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(skpv[3][0]), noiseseed, 2, 3); neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 4, 5); neon_poly_getnoise_eta1_2x(&(e[2][0]), &(e[3][0]), noiseseed, 6, 7); - #endif neon_polyvec_ntt(skpv); neon_polyvec_ntt(e); @@ -410,22 +278,6 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], poly_frommsg(k, m); gen_at(at, seed); - #if KYBER_K == 2 - // ETA1 != ETA2 (3 != 2) - neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); - neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3); - neon_poly_getnoise_eta2(&(epp[0]), coins, 4); - #elif KYBER_K == 3 - #if KYBER_ETA1 == KYBER_ETA2 - // Because ETA1 == ETA2 - neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); - neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(ep[0][0]), coins, 2, 3); - neon_poly_getnoise_eta1_2x(&(ep[1][0]), &(ep[2][0]), coins, 4, 5); - neon_poly_getnoise_eta2(&(epp[0]), coins, 6); - #else -#error "We need eta1 == eta2 here" - #endif - #elif KYBER_K == 4 #if KYBER_ETA1 == KYBER_ETA2 neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3); @@ -435,7 +287,6 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], #else #error "We need eta1 == eta2 here" #endif - #endif neon_polyvec_ntt(sp); diff --git a/crypto_kem/kyber1024/aarch64/kem.c b/crypto_kem/kyber1024/aarch64/kem.c index a71d5ac6..d694befa 100644 --- a/crypto_kem/kyber1024/aarch64/kem.c +++ b/crypto_kem/kyber1024/aarch64/kem.c @@ -34,8 +34,8 @@ * Returns 0 (success) **************************************************/ int crypto_kem_keypair_derand(uint8_t *pk, - uint8_t *sk, - const uint8_t *coins) { + uint8_t *sk, + const uint8_t *coins) { indcpa_keypair_derand(pk, sk, coins); memcpy(sk + KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES); hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); @@ -85,9 +85,9 @@ int crypto_kem_keypair(uint8_t *pk, * Returns 0 (success) **************************************************/ int crypto_kem_enc_derand(uint8_t *ct, - uint8_t *ss, - const uint8_t *pk, - const uint8_t *coins) { + uint8_t *ss, + const uint8_t *pk, + const uint8_t *coins) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber1024/aarch64/kem.h b/crypto_kem/kyber1024/aarch64/kem.h index d542de09..b674a77f 100644 --- a/crypto_kem/kyber1024/aarch64/kem.h +++ b/crypto_kem/kyber1024/aarch64/kem.h @@ -10,13 +10,7 @@ #include #include "params.h" -#if (KYBER_K == 2) -#define CRYPTO_ALGNAME "Kyber512" -#elif (KYBER_K == 3) -#define CRYPTO_ALGNAME "Kyber768" -#elif (KYBER_K == 4) #define CRYPTO_ALGNAME "Kyber1024" -#endif #define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand) int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins); diff --git a/crypto_kem/kyber1024/aarch64/poly.c b/crypto_kem/kyber1024/aarch64/poly.c index 9e7abbd0..6a91179d 100644 --- a/crypto_kem/kyber1024/aarch64/poly.c +++ b/crypto_kem/kyber1024/aarch64/poly.c @@ -54,22 +54,6 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N int16_t u; uint8_t t[8]; - #if (KYBER_POLYCOMPRESSEDBYTES == 128) - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - // map to positive standard representatives - u = a[8 * i + j]; - u += (u >> 15) & KYBER_Q; - t[j] = ((((uint16_t)u << 4) + KYBER_Q / 2) / KYBER_Q) & 15; - } - - r[0] = t[0] | (t[1] << 4); - r[1] = t[2] | (t[3] << 4); - r[2] = t[4] | (t[5] << 4); - r[3] = t[6] | (t[7] << 4); - r += 4; - } - #elif (KYBER_POLYCOMPRESSEDBYTES == 160) for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { // map to positive standard representatives @@ -85,9 +69,6 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N r[4] = (t[6] >> 2) | (t[7] << 3); r += 5; } - #else -#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" - #endif } /************************************************* @@ -103,13 +84,6 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N void poly_decompress(int16_t r[KYBER_N], const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { unsigned int i; - #if (KYBER_POLYCOMPRESSEDBYTES == 128) - for (i = 0; i < KYBER_N / 2; i++) { - r[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; - r[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; - a += 1; - } - #elif (KYBER_POLYCOMPRESSEDBYTES == 160) unsigned int j; uint8_t t[8]; for (i = 0; i < KYBER_N / 8; i++) { @@ -127,9 +101,6 @@ void poly_decompress(int16_t r[KYBER_N], const uint8_t a[KYBER_POLYCOMPRESSEDBYT r[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; } } - #else -#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" - #endif } /************************************************* diff --git a/crypto_kem/kyber512/aarch64/cbd.c b/crypto_kem/kyber512/aarch64/cbd.c index c0e9fe18..c26fd7fd 100644 --- a/crypto_kem/kyber512/aarch64/cbd.c +++ b/crypto_kem/kyber512/aarch64/cbd.c @@ -127,7 +127,6 @@ void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) { * * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) **************************************************/ -#if KYBER_ETA1 == 3 static uint32_t load24_littleendian(const uint8_t x[3]) { uint32_t r; r = (uint32_t)x[0]; @@ -135,7 +134,6 @@ static uint32_t load24_littleendian(const uint8_t x[3]) { r |= (uint32_t)x[2] << 16; return r; } -#endif /************************************************* * Name: cbd3 @@ -148,7 +146,6 @@ static uint32_t load24_littleendian(const uint8_t x[3]) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -#if KYBER_ETA1 == 3 static void cbd3(int16_t *r, const uint8_t buf[3 * KYBER_N / 4]) { unsigned int i, j; uint32_t t, d; @@ -167,16 +164,9 @@ static void cbd3(int16_t *r, const uint8_t buf[3 * KYBER_N / 4]) { } } } -#endif void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { - #if KYBER_ETA1 == 2 - neon_cbd2(r, buf); - #elif KYBER_ETA1 == 3 cbd3(r, buf); - #else -#error "This implementation requires eta1 in {2,3}" - #endif } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { diff --git a/crypto_kem/kyber512/aarch64/indcpa.c b/crypto_kem/kyber512/aarch64/indcpa.c index 0bca9a64..07b2c071 100644 --- a/crypto_kem/kyber512/aarch64/indcpa.c +++ b/crypto_kem/kyber512/aarch64/indcpa.c @@ -162,7 +162,6 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; neon_xof_state state; - #if KYBER_K == 2 for (unsigned int i = 0; i < KYBER_K; i++) { if (transposed) { neon_xof_absorb(&state, seed, i, i, 0, 1); @@ -189,129 +188,6 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); } } - #elif KYBER_K == 3 - int16_t *s1 = NULL, *s2 = NULL; - unsigned int x1, x2, y1, y2; - xof_state c_state; - - for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2) { - switch (j) { - case 0: - s1 = &(a[0][0][0]); - s2 = &(a[0][1][0]); - x1 = 0; - y1 = 0; - x2 = 0; - y2 = 1; - break; - case 2: - s1 = &(a[0][2][0]); - s2 = &(a[1][0][0]); - x1 = 0; - y1 = 2; - x2 = 1; - y2 = 0; - break; - case 4: - s1 = &(a[1][1][0]); - s2 = &(a[1][2][0]); - x1 = 1; - y1 = 1; - x2 = 1; - y2 = 2; - break; - default: - s1 = &(a[2][0][0]); - s2 = &(a[2][1][0]); - x1 = 2; - y1 = 0; - x2 = 2; - y2 = 1; - break; - } - - if (transposed) { - neon_xof_absorb(&state, seed, x1, x2, y1, y2); - } else { - neon_xof_absorb(&state, seed, y1, y2, x1, x2); - } - - neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); - - buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; - - ctr0 = neon_rej_uniform(s1, buf0); - ctr1 = neon_rej_uniform(s2, buf1); - - while (ctr0 < KYBER_N || ctr1 < KYBER_N) { - off = buflen % 3; - for (k = 0; k < off; k++) { - buf0[k] = buf0[buflen - off + k]; - buf1[k] = buf1[buflen - off + k]; - } - neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); - - buflen = off + XOF_BLOCKBYTES; - ctr0 += rej_uniform(s1 + ctr0, KYBER_N - ctr0, buf0, buflen); - ctr1 += rej_uniform(s2 + ctr1, KYBER_N - ctr1, buf1, buflen); - } - } - - // Last iteration [2][2] - if (transposed) { - xof_absorb(&c_state, seed, 2, 2); - } else { - xof_absorb(&c_state, seed, 2, 2); - } - - xof_squeezeblocks(buf0, GEN_MATRIX_NBLOCKS, &c_state); - - buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; - - ctr0 = neon_rej_uniform(&(a[2][2][0]), buf0); - - while (ctr0 < KYBER_N) { - off = buflen % 3; - for (k = 0; k < off; k++) { - buf0[k] = buf0[buflen - off + k]; - } - xof_squeezeblocks(buf0 + off, 1, &c_state); - - buflen = off + XOF_BLOCKBYTES; - ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); - } - - #elif KYBER_K == 4 - for (unsigned int i = 0; i < KYBER_K; i++) { - for (unsigned int j = 0; j < KYBER_K; j += 2) { - if (transposed) { - neon_xof_absorb(&state, seed, i, i, j, j + 1); - } else { - neon_xof_absorb(&state, seed, j, j + 1, i, i); - } - - neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); - buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; - ctr0 = neon_rej_uniform(&(a[i][j][0]), buf0); - ctr1 = neon_rej_uniform(&(a[i][j + 1][0]), buf1); - - while (ctr0 < KYBER_N || ctr1 < KYBER_N) { - off = buflen % 3; - for (k = 0; k < off; k++) { - buf0[k] = buf0[buflen - off + k]; - buf1[k] = buf1[buflen - off + k]; - } - neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); - - buflen = off + XOF_BLOCKBYTES; - ctr0 += rej_uniform(&(a[i][j][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); - ctr1 += rej_uniform(&(a[i][j + 1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); - } - } - } - #else -#error "KYBER_K must be in {2,3,4}" - #endif } /************************************************* @@ -326,8 +202,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]) { + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; uint8_t buf[2 * KYBER_SYMBYTES]; const uint8_t *publicseed = buf; @@ -342,19 +218,8 @@ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], gen_a(a, publicseed); - #if KYBER_K == 2 neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3); - #elif KYBER_K == 3 - neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); - neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(e[0][0]), noiseseed, 2, 3); - neon_poly_getnoise_eta1_2x(&(e[1][0]), &(e[2][0]), noiseseed, 4, 5); - #elif KYBER_K == 4 - neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); - neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(skpv[3][0]), noiseseed, 2, 3); - neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 4, 5); - neon_poly_getnoise_eta1_2x(&(e[2][0]), &(e[3][0]), noiseseed, 6, 7); - #endif neon_polyvec_ntt(skpv); neon_polyvec_ntt(e); @@ -410,32 +275,10 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], poly_frommsg(k, m); gen_at(at, seed); - #if KYBER_K == 2 // ETA1 != ETA2 (3 != 2) neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3); neon_poly_getnoise_eta2(&(epp[0]), coins, 4); - #elif KYBER_K == 3 - #if KYBER_ETA1 == KYBER_ETA2 - // Because ETA1 == ETA2 - neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); - neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(ep[0][0]), coins, 2, 3); - neon_poly_getnoise_eta1_2x(&(ep[1][0]), &(ep[2][0]), coins, 4, 5); - neon_poly_getnoise_eta2(&(epp[0]), coins, 6); - #else -#error "We need eta1 == eta2 here" - #endif - #elif KYBER_K == 4 - #if KYBER_ETA1 == KYBER_ETA2 - neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); - neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3); - neon_poly_getnoise_eta1_2x(&(ep[0][0]), &(ep[1][0]), coins, 4, 5); - neon_poly_getnoise_eta1_2x(&(ep[2][0]), &(ep[3][0]), coins, 6, 7); - neon_poly_getnoise_eta2(&(epp[0]), coins, 8); - #else -#error "We need eta1 == eta2 here" - #endif - #endif neon_polyvec_ntt(sp); diff --git a/crypto_kem/kyber512/aarch64/kem.c b/crypto_kem/kyber512/aarch64/kem.c index a71d5ac6..d694befa 100644 --- a/crypto_kem/kyber512/aarch64/kem.c +++ b/crypto_kem/kyber512/aarch64/kem.c @@ -34,8 +34,8 @@ * Returns 0 (success) **************************************************/ int crypto_kem_keypair_derand(uint8_t *pk, - uint8_t *sk, - const uint8_t *coins) { + uint8_t *sk, + const uint8_t *coins) { indcpa_keypair_derand(pk, sk, coins); memcpy(sk + KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES); hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); @@ -85,9 +85,9 @@ int crypto_kem_keypair(uint8_t *pk, * Returns 0 (success) **************************************************/ int crypto_kem_enc_derand(uint8_t *ct, - uint8_t *ss, - const uint8_t *pk, - const uint8_t *coins) { + uint8_t *ss, + const uint8_t *pk, + const uint8_t *coins) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber512/aarch64/kem.h b/crypto_kem/kyber512/aarch64/kem.h index 8702ac47..28ad7098 100644 --- a/crypto_kem/kyber512/aarch64/kem.h +++ b/crypto_kem/kyber512/aarch64/kem.h @@ -10,13 +10,7 @@ #include #include "params.h" -#if (KYBER_K == 2) #define CRYPTO_ALGNAME "Kyber512" -#elif (KYBER_K == 3) -#define CRYPTO_ALGNAME "Kyber768" -#elif (KYBER_K == 4) -#define CRYPTO_ALGNAME "Kyber1024" -#endif #define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand) int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins); diff --git a/crypto_kem/kyber512/aarch64/poly.c b/crypto_kem/kyber512/aarch64/poly.c index 9e7abbd0..be4ee061 100644 --- a/crypto_kem/kyber512/aarch64/poly.c +++ b/crypto_kem/kyber512/aarch64/poly.c @@ -54,7 +54,6 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N int16_t u; uint8_t t[8]; - #if (KYBER_POLYCOMPRESSEDBYTES == 128) for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { // map to positive standard representatives @@ -69,25 +68,6 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N r[3] = t[6] | (t[7] << 4); r += 4; } - #elif (KYBER_POLYCOMPRESSEDBYTES == 160) - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - // map to positive standard representatives - u = a[8 * i + j]; - u += (u >> 15) & KYBER_Q; - t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31; - } - - r[0] = (t[0] >> 0) | (t[1] << 5); - r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); - r[2] = (t[3] >> 1) | (t[4] << 4); - r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); - r[4] = (t[6] >> 2) | (t[7] << 3); - r += 5; - } - #else -#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" - #endif } /************************************************* @@ -103,33 +83,11 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N void poly_decompress(int16_t r[KYBER_N], const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { unsigned int i; - #if (KYBER_POLYCOMPRESSEDBYTES == 128) for (i = 0; i < KYBER_N / 2; i++) { r[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; r[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; a += 1; } - #elif (KYBER_POLYCOMPRESSEDBYTES == 160) - unsigned int j; - uint8_t t[8]; - for (i = 0; i < KYBER_N / 8; i++) { - t[0] = (a[0] >> 0); - t[1] = (a[0] >> 5) | (a[1] << 3); - t[2] = (a[1] >> 2); - t[3] = (a[1] >> 7) | (a[2] << 1); - t[4] = (a[2] >> 4) | (a[3] << 4); - t[5] = (a[3] >> 1); - t[6] = (a[3] >> 6) | (a[4] << 2); - t[7] = (a[4] >> 3); - a += 5; - - for (j = 0; j < 8; j++) { - r[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; - } - } - #else -#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" - #endif } /************************************************* diff --git a/crypto_kem/kyber768/aarch64/cbd.c b/crypto_kem/kyber768/aarch64/cbd.c index c0e9fe18..6ae95c03 100644 --- a/crypto_kem/kyber768/aarch64/cbd.c +++ b/crypto_kem/kyber768/aarch64/cbd.c @@ -127,15 +127,6 @@ void neon_cbd2(int16_t *r, const uint8_t buf[2 * KYBER_N / 4]) { * * Returns 32-bit unsigned integer loaded from x (most significant byte is zero) **************************************************/ -#if KYBER_ETA1 == 3 -static uint32_t load24_littleendian(const uint8_t x[3]) { - uint32_t r; - r = (uint32_t)x[0]; - r |= (uint32_t)x[1] << 8; - r |= (uint32_t)x[2] << 16; - return r; -} -#endif /************************************************* * Name: cbd3 @@ -148,35 +139,9 @@ static uint32_t load24_littleendian(const uint8_t x[3]) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -#if KYBER_ETA1 == 3 -static void cbd3(int16_t *r, const uint8_t buf[3 * KYBER_N / 4]) { - unsigned int i, j; - uint32_t t, d; - int16_t a, b; - - for (i = 0; i < KYBER_N / 4; i++) { - t = load24_littleendian(buf + 3 * i); - d = t & 0x00249249; - d += (t >> 1) & 0x00249249; - d += (t >> 2) & 0x00249249; - - for (j = 0; j < 4; j++) { - a = (d >> (6 * j + 0)) & 0x7; - b = (d >> (6 * j + 3)) & 0x7; - r[4 * i + j] = a - b; - } - } -} -#endif void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) { - #if KYBER_ETA1 == 2 neon_cbd2(r, buf); - #elif KYBER_ETA1 == 3 - cbd3(r, buf); - #else -#error "This implementation requires eta1 in {2,3}" - #endif } void poly_cbd_eta2(int16_t *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) { diff --git a/crypto_kem/kyber768/aarch64/indcpa.c b/crypto_kem/kyber768/aarch64/indcpa.c index d9c60ffa..2c383997 100644 --- a/crypto_kem/kyber768/aarch64/indcpa.c +++ b/crypto_kem/kyber768/aarch64/indcpa.c @@ -162,34 +162,6 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2]; neon_xof_state state; - #if KYBER_K == 2 - for (unsigned int i = 0; i < KYBER_K; i++) { - if (transposed) { - neon_xof_absorb(&state, seed, i, i, 0, 1); - } else { - neon_xof_absorb(&state, seed, 0, 1, i, i); - } - - neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); - - buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; - - ctr0 = neon_rej_uniform(&(a[i][0][0]), buf0); - ctr1 = neon_rej_uniform(&(a[i][1][0]), buf1); - while (ctr0 < KYBER_N || ctr1 < KYBER_N) { - off = buflen % 3; - for (k = 0; k < off; k++) { - buf0[k] = buf0[buflen - off + k]; - buf1[k] = buf1[buflen - off + k]; - } - neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); - - buflen = off + XOF_BLOCKBYTES; - ctr0 += rej_uniform(&(a[i][0][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); - ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); - } - } - #elif KYBER_K == 3 int16_t *s1 = NULL, *s2 = NULL; unsigned int x1, x2, y1, y2; xof_state c_state; @@ -281,37 +253,6 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); } - #elif KYBER_K == 4 - for (unsigned int i = 0; i < KYBER_K; i++) { - for (unsigned int j = 0; j < KYBER_K; j += 2) { - if (transposed) { - neon_xof_absorb(&state, seed, i, i, j, j + 1); - } else { - neon_xof_absorb(&state, seed, j, j + 1, i, i); - } - - neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state); - buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES; - ctr0 = neon_rej_uniform(&(a[i][j][0]), buf0); - ctr1 = neon_rej_uniform(&(a[i][j + 1][0]), buf1); - - while (ctr0 < KYBER_N || ctr1 < KYBER_N) { - off = buflen % 3; - for (k = 0; k < off; k++) { - buf0[k] = buf0[buflen - off + k]; - buf1[k] = buf1[buflen - off + k]; - } - neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state); - - buflen = off + XOF_BLOCKBYTES; - ctr0 += rej_uniform(&(a[i][j][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); - ctr1 += rej_uniform(&(a[i][j + 1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen); - } - } - } - #else -#error "KYBER_K must be in {2,3,4}" - #endif } /************************************************* @@ -326,8 +267,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]) { + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { unsigned int i; uint8_t buf[2 * KYBER_SYMBYTES]; const uint8_t *publicseed = buf; @@ -342,19 +283,9 @@ void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], gen_a(a, publicseed); - #if KYBER_K == 2 - neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); - neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3); - #elif KYBER_K == 3 neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(e[0][0]), noiseseed, 2, 3); neon_poly_getnoise_eta1_2x(&(e[1][0]), &(e[2][0]), noiseseed, 4, 5); - #elif KYBER_K == 4 - neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1); - neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(skpv[3][0]), noiseseed, 2, 3); - neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 4, 5); - neon_poly_getnoise_eta1_2x(&(e[2][0]), &(e[3][0]), noiseseed, 6, 7); - #endif neon_polyvec_ntt(skpv); neon_polyvec_ntt(e); @@ -410,12 +341,6 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], poly_frommsg(k, m); gen_at(at, seed); - #if KYBER_K == 2 - // ETA1 != ETA2 (3 != 2) - neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); - neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3); - neon_poly_getnoise_eta2(&(epp[0]), coins, 4); - #elif KYBER_K == 3 #if KYBER_ETA1 == KYBER_ETA2 // Because ETA1 == ETA2 neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); @@ -425,17 +350,6 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], #else #error "We need eta1 == eta2 here" #endif - #elif KYBER_K == 4 - #if KYBER_ETA1 == KYBER_ETA2 - neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); - neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3); - neon_poly_getnoise_eta1_2x(&(ep[0][0]), &(ep[1][0]), coins, 4, 5); - neon_poly_getnoise_eta1_2x(&(ep[2][0]), &(ep[3][0]), coins, 6, 7); - neon_poly_getnoise_eta2(&(epp[0]), coins, 8); - #else -#error "We need eta1 == eta2 here" - #endif - #endif neon_polyvec_ntt(sp); diff --git a/crypto_kem/kyber768/aarch64/kem.c b/crypto_kem/kyber768/aarch64/kem.c index a71d5ac6..d694befa 100644 --- a/crypto_kem/kyber768/aarch64/kem.c +++ b/crypto_kem/kyber768/aarch64/kem.c @@ -34,8 +34,8 @@ * Returns 0 (success) **************************************************/ int crypto_kem_keypair_derand(uint8_t *pk, - uint8_t *sk, - const uint8_t *coins) { + uint8_t *sk, + const uint8_t *coins) { indcpa_keypair_derand(pk, sk, coins); memcpy(sk + KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES); hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); @@ -85,9 +85,9 @@ int crypto_kem_keypair(uint8_t *pk, * Returns 0 (success) **************************************************/ int crypto_kem_enc_derand(uint8_t *ct, - uint8_t *ss, - const uint8_t *pk, - const uint8_t *coins) { + uint8_t *ss, + const uint8_t *pk, + const uint8_t *coins) { uint8_t buf[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ uint8_t kr[2 * KYBER_SYMBYTES]; diff --git a/crypto_kem/kyber768/aarch64/kem.h b/crypto_kem/kyber768/aarch64/kem.h index fb8a731d..ce50e37c 100644 --- a/crypto_kem/kyber768/aarch64/kem.h +++ b/crypto_kem/kyber768/aarch64/kem.h @@ -10,13 +10,7 @@ #include #include "params.h" -#if (KYBER_K == 2) -#define CRYPTO_ALGNAME "Kyber512" -#elif (KYBER_K == 3) #define CRYPTO_ALGNAME "Kyber768" -#elif (KYBER_K == 4) -#define CRYPTO_ALGNAME "Kyber1024" -#endif #define crypto_kem_keypair_derand KYBER_NAMESPACE(keypair_derand) int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins); diff --git a/crypto_kem/kyber768/aarch64/poly.c b/crypto_kem/kyber768/aarch64/poly.c index 9e7abbd0..be4ee061 100644 --- a/crypto_kem/kyber768/aarch64/poly.c +++ b/crypto_kem/kyber768/aarch64/poly.c @@ -54,7 +54,6 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N int16_t u; uint8_t t[8]; - #if (KYBER_POLYCOMPRESSEDBYTES == 128) for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { // map to positive standard representatives @@ -69,25 +68,6 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N r[3] = t[6] | (t[7] << 4); r += 4; } - #elif (KYBER_POLYCOMPRESSEDBYTES == 160) - for (i = 0; i < KYBER_N / 8; i++) { - for (j = 0; j < 8; j++) { - // map to positive standard representatives - u = a[8 * i + j]; - u += (u >> 15) & KYBER_Q; - t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31; - } - - r[0] = (t[0] >> 0) | (t[1] << 5); - r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); - r[2] = (t[3] >> 1) | (t[4] << 4); - r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); - r[4] = (t[6] >> 2) | (t[7] << 3); - r += 5; - } - #else -#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" - #endif } /************************************************* @@ -103,33 +83,11 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const int16_t a[KYBER_N void poly_decompress(int16_t r[KYBER_N], const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { unsigned int i; - #if (KYBER_POLYCOMPRESSEDBYTES == 128) for (i = 0; i < KYBER_N / 2; i++) { r[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; r[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; a += 1; } - #elif (KYBER_POLYCOMPRESSEDBYTES == 160) - unsigned int j; - uint8_t t[8]; - for (i = 0; i < KYBER_N / 8; i++) { - t[0] = (a[0] >> 0); - t[1] = (a[0] >> 5) | (a[1] << 3); - t[2] = (a[1] >> 2); - t[3] = (a[1] >> 7) | (a[2] << 1); - t[4] = (a[2] >> 4) | (a[3] << 4); - t[5] = (a[3] >> 1); - t[6] = (a[3] >> 6) | (a[4] << 2); - t[7] = (a[4] >> 3); - a += 5; - - for (j = 0; j < 8; j++) { - r[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; - } - } - #else -#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" - #endif } /************************************************* From 0a814b9484bd51f8fba4c687e662a50f78019756 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 17:39:19 +0100 Subject: [PATCH 66/85] more tweak for unifdef --- crypto_kem/kyber1024/aarch64/indcpa.h | 4 ++-- crypto_kem/kyber512/aarch64/indcpa.h | 4 ++-- crypto_kem/kyber768/aarch64/indcpa.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/indcpa.h b/crypto_kem/kyber1024/aarch64/indcpa.h index 2e4b46d4..6357746c 100644 --- a/crypto_kem/kyber1024/aarch64/indcpa.h +++ b/crypto_kem/kyber1024/aarch64/indcpa.h @@ -15,8 +15,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); #define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand) void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]); + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]); #define indcpa_enc KYBER_NAMESPACE(indcpa_enc) void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], diff --git a/crypto_kem/kyber512/aarch64/indcpa.h b/crypto_kem/kyber512/aarch64/indcpa.h index 0fdac5e2..25ae3a1a 100644 --- a/crypto_kem/kyber512/aarch64/indcpa.h +++ b/crypto_kem/kyber512/aarch64/indcpa.h @@ -15,8 +15,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); #define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand) void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]); + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]); #define indcpa_enc KYBER_NAMESPACE(indcpa_enc) void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], diff --git a/crypto_kem/kyber768/aarch64/indcpa.h b/crypto_kem/kyber768/aarch64/indcpa.h index 313888c0..ebcd26c2 100644 --- a/crypto_kem/kyber768/aarch64/indcpa.h +++ b/crypto_kem/kyber768/aarch64/indcpa.h @@ -15,8 +15,8 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); #define indcpa_keypair_derand KYBER_NAMESPACE(indcpa_keypair_derand) void indcpa_keypair_derand(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], - uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], - const uint8_t coins[KYBER_SYMBYTES]); + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]); #define indcpa_enc KYBER_NAMESPACE(indcpa_enc) void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], From 78946a1a48af71f7679cc884e8cf92a72e4f30fd Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 17:50:04 +0100 Subject: [PATCH 67/85] a more complete unifdef --- crypto_kem/kyber1024/aarch64/indcpa.c | 4 ---- crypto_kem/kyber768/aarch64/indcpa.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/indcpa.c b/crypto_kem/kyber1024/aarch64/indcpa.c index 04c4ae88..5b8c5f3a 100644 --- a/crypto_kem/kyber1024/aarch64/indcpa.c +++ b/crypto_kem/kyber1024/aarch64/indcpa.c @@ -278,15 +278,11 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], poly_frommsg(k, m); gen_at(at, seed); - #if KYBER_ETA1 == KYBER_ETA2 neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3); neon_poly_getnoise_eta1_2x(&(ep[0][0]), &(ep[1][0]), coins, 4, 5); neon_poly_getnoise_eta1_2x(&(ep[2][0]), &(ep[3][0]), coins, 6, 7); neon_poly_getnoise_eta2(&(epp[0]), coins, 8); - #else -#error "We need eta1 == eta2 here" - #endif neon_polyvec_ntt(sp); diff --git a/crypto_kem/kyber768/aarch64/indcpa.c b/crypto_kem/kyber768/aarch64/indcpa.c index 2c383997..fef47388 100644 --- a/crypto_kem/kyber768/aarch64/indcpa.c +++ b/crypto_kem/kyber768/aarch64/indcpa.c @@ -341,15 +341,11 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], poly_frommsg(k, m); gen_at(at, seed); - #if KYBER_ETA1 == KYBER_ETA2 // Because ETA1 == ETA2 neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1); neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(ep[0][0]), coins, 2, 3); neon_poly_getnoise_eta1_2x(&(ep[1][0]), &(ep[2][0]), coins, 4, 5); neon_poly_getnoise_eta2(&(epp[0]), coins, 6); - #else -#error "We need eta1 == eta2 here" - #endif neon_polyvec_ntt(sp); From d59c3bc3f254fefd1346468d3f5634a012a859d8 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 18:22:46 +0100 Subject: [PATCH 68/85] tweak preprocessing --- crypto_kem/kyber1024/aarch64/polyvec.c | 41 --------------------- crypto_kem/kyber512/aarch64/polyvec.c | 51 -------------------------- crypto_kem/kyber768/aarch64/polyvec.c | 51 -------------------------- 3 files changed, 143 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/polyvec.c b/crypto_kem/kyber1024/aarch64/polyvec.c index 8907c316..f382543a 100644 --- a/crypto_kem/kyber1024/aarch64/polyvec.c +++ b/crypto_kem/kyber1024/aarch64/polyvec.c @@ -22,7 +22,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; - #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 8; j++) { @@ -46,27 +45,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[K r += 11; } } - #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) - uint16_t t[4]; - for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - for (k = 0; k < 4; k++) { - t[k] = a[i][4 * j + k]; - t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; - t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; - } - - r[0] = (t[0] >> 0); - r[1] = (t[0] >> 8) | (t[1] << 2); - r[2] = (t[1] >> 6) | (t[2] << 4); - r[3] = (t[2] >> 4) | (t[3] << 6); - r[4] = (t[3] >> 2); - r += 5; - } - } - #else -#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" - #endif } /************************************************* @@ -82,7 +60,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[K void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; - #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) uint16_t t[8]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 8; j++) { @@ -101,24 +78,6 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV } } } - #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) - uint16_t t[4]; - for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 4; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); - t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); - t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); - t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); - a += 5; - - for (k = 0; k < 4; k++) { - r[i][4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; - } - } - } - #else -#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" - #endif } /************************************************* diff --git a/crypto_kem/kyber512/aarch64/polyvec.c b/crypto_kem/kyber512/aarch64/polyvec.c index 8907c316..8930c956 100644 --- a/crypto_kem/kyber512/aarch64/polyvec.c +++ b/crypto_kem/kyber512/aarch64/polyvec.c @@ -22,31 +22,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; - #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) - uint16_t t[8]; - for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 8; j++) { - for (k = 0; k < 8; k++) { - t[k] = a[i][8 * j + k]; - t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; - t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; - } - - r[ 0] = (t[0] >> 0); - r[ 1] = (t[0] >> 8) | (t[1] << 3); - r[ 2] = (t[1] >> 5) | (t[2] << 6); - r[ 3] = (t[2] >> 2); - r[ 4] = (t[2] >> 10) | (t[3] << 1); - r[ 5] = (t[3] >> 7) | (t[4] << 4); - r[ 6] = (t[4] >> 4) | (t[5] << 7); - r[ 7] = (t[5] >> 1); - r[ 8] = (t[5] >> 9) | (t[6] << 2); - r[ 9] = (t[6] >> 6) | (t[7] << 5); - r[10] = (t[7] >> 3); - r += 11; - } - } - #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { @@ -64,9 +39,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[K r += 5; } } - #else -#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" - #endif } /************************************************* @@ -82,26 +54,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[K void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; - #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) - uint16_t t[8]; - for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 8; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); - t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); - t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); - t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); - t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); - t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); - t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); - t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); - a += 11; - - for (k = 0; k < 8; k++) { - r[i][8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; - } - } - } - #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { @@ -116,9 +68,6 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV } } } - #else -#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" - #endif } /************************************************* diff --git a/crypto_kem/kyber768/aarch64/polyvec.c b/crypto_kem/kyber768/aarch64/polyvec.c index 8907c316..8930c956 100644 --- a/crypto_kem/kyber768/aarch64/polyvec.c +++ b/crypto_kem/kyber768/aarch64/polyvec.c @@ -22,31 +22,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; - #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) - uint16_t t[8]; - for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 8; j++) { - for (k = 0; k < 8; k++) { - t[k] = a[i][8 * j + k]; - t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; - t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; - } - - r[ 0] = (t[0] >> 0); - r[ 1] = (t[0] >> 8) | (t[1] << 3); - r[ 2] = (t[1] >> 5) | (t[2] << 6); - r[ 3] = (t[2] >> 2); - r[ 4] = (t[2] >> 10) | (t[3] << 1); - r[ 5] = (t[3] >> 7) | (t[4] << 4); - r[ 6] = (t[4] >> 4) | (t[5] << 7); - r[ 7] = (t[5] >> 1); - r[ 8] = (t[5] >> 9) | (t[6] << 2); - r[ 9] = (t[6] >> 6) | (t[7] << 5); - r[10] = (t[7] >> 3); - r += 11; - } - } - #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { @@ -64,9 +39,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[K r += 5; } } - #else -#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" - #endif } /************************************************* @@ -82,26 +54,6 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[K void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { unsigned int i, j, k; - #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) - uint16_t t[8]; - for (i = 0; i < KYBER_K; i++) { - for (j = 0; j < KYBER_N / 8; j++) { - t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); - t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); - t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); - t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); - t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); - t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); - t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); - t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); - a += 11; - - for (k = 0; k < 8; k++) { - r[i][8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; - } - } - } - #elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) uint16_t t[4]; for (i = 0; i < KYBER_K; i++) { for (j = 0; j < KYBER_N / 4; j++) { @@ -116,9 +68,6 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV } } } - #else -#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" - #endif } /************************************************* From e83d1e32d822fdab40b504997188c96cabaefa47 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 18:26:33 +0100 Subject: [PATCH 69/85] a more completed unifdef --- crypto_kem/kyber1024/aarch64/poly.c | 4 -- crypto_kem/kyber1024/aarch64/reduce.h | 4 +- crypto_kem/kyber512/aarch64/poly.c | 4 -- crypto_kem/kyber512/aarch64/reduce.h | 4 +- crypto_kem/kyber768/aarch64/poly.c | 4 -- crypto_kem/kyber768/aarch64/reduce.h | 4 +- crypto_sign/dilithium2/aarch64/params.h | 4 -- crypto_sign/dilithium2/aarch64/poly.c | 48 ----------------- crypto_sign/dilithium3/aarch64/params.h | 4 -- crypto_sign/dilithium3/aarch64/poly.c | 72 ------------------------- crypto_sign/dilithium5/aarch64/params.h | 4 -- crypto_sign/dilithium5/aarch64/poly.c | 48 ----------------- 12 files changed, 6 insertions(+), 198 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/poly.c b/crypto_kem/kyber1024/aarch64/poly.c index 6a91179d..0c26205d 100644 --- a/crypto_kem/kyber1024/aarch64/poly.c +++ b/crypto_kem/kyber1024/aarch64/poly.c @@ -186,10 +186,6 @@ void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]) unsigned int i, j; int16_t mask; - #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) -#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" - #endif - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { mask = -(int16_t)((msg[i] >> j) & 1); diff --git a/crypto_kem/kyber1024/aarch64/reduce.h b/crypto_kem/kyber1024/aarch64/reduce.h index ee362244..1cb8265b 100644 --- a/crypto_kem/kyber1024/aarch64/reduce.h +++ b/crypto_kem/kyber1024/aarch64/reduce.h @@ -10,8 +10,8 @@ #include #include "params.h" -#define MONT -1044 // 2^16 mod q -#define QINV -3327 // q^-1 mod 2^16 +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 #define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce) int16_t montgomery_reduce(int32_t a); diff --git a/crypto_kem/kyber512/aarch64/poly.c b/crypto_kem/kyber512/aarch64/poly.c index be4ee061..3cb9ecc4 100644 --- a/crypto_kem/kyber512/aarch64/poly.c +++ b/crypto_kem/kyber512/aarch64/poly.c @@ -173,10 +173,6 @@ void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]) unsigned int i, j; int16_t mask; - #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) -#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" - #endif - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { mask = -(int16_t)((msg[i] >> j) & 1); diff --git a/crypto_kem/kyber512/aarch64/reduce.h b/crypto_kem/kyber512/aarch64/reduce.h index c093e84a..4266c7d7 100644 --- a/crypto_kem/kyber512/aarch64/reduce.h +++ b/crypto_kem/kyber512/aarch64/reduce.h @@ -10,8 +10,8 @@ #include #include "params.h" -#define MONT -1044 // 2^16 mod q -#define QINV -3327 // q^-1 mod 2^16 +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 #define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce) int16_t montgomery_reduce(int32_t a); diff --git a/crypto_kem/kyber768/aarch64/poly.c b/crypto_kem/kyber768/aarch64/poly.c index be4ee061..3cb9ecc4 100644 --- a/crypto_kem/kyber768/aarch64/poly.c +++ b/crypto_kem/kyber768/aarch64/poly.c @@ -173,10 +173,6 @@ void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]) unsigned int i, j; int16_t mask; - #if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) -#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" - #endif - for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { mask = -(int16_t)((msg[i] >> j) & 1); diff --git a/crypto_kem/kyber768/aarch64/reduce.h b/crypto_kem/kyber768/aarch64/reduce.h index e16a894b..e64226f6 100644 --- a/crypto_kem/kyber768/aarch64/reduce.h +++ b/crypto_kem/kyber768/aarch64/reduce.h @@ -10,8 +10,8 @@ #include #include "params.h" -#define MONT -1044 // 2^16 mod q -#define QINV -3327 // q^-1 mod 2^16 +#define MONT (-1044) // 2^16 mod q +#define QINV (-3327) // q^-1 mod 2^16 #define montgomery_reduce KYBER_NAMESPACE(montgomery_reduce) int16_t montgomery_reduce(int32_t a); diff --git a/crypto_sign/dilithium2/aarch64/params.h b/crypto_sign/dilithium2/aarch64/params.h index ced52a7c..cf8e2985 100644 --- a/crypto_sign/dilithium2/aarch64/params.h +++ b/crypto_sign/dilithium2/aarch64/params.h @@ -85,11 +85,7 @@ #define POLYW1_PACKEDBYTES 128 #endif -#if ETA == 2 #define POLYETA_PACKEDBYTES 96 -#elif ETA == 4 -#define POLYETA_PACKEDBYTES 128 -#endif #define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) #define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ diff --git a/crypto_sign/dilithium2/aarch64/poly.c b/crypto_sign/dilithium2/aarch64/poly.c index 62721a38..2e5a1f30 100644 --- a/crypto_sign/dilithium2/aarch64/poly.c +++ b/crypto_sign/dilithium2/aarch64/poly.c @@ -467,7 +467,6 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - #if ETA == 2 if (t0 < 15) { t0 = t0 - (205 * t0 >> 10) * 5; @@ -478,20 +477,6 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 2 - t1; } - #elif ETA == 4 - - if (t0 < 9) { - a[ctr++] = 4 - t0; - } - if (t1 < 9 && ctr < len) { - a[ctr++] = 4 - t1; - } - - #else - -#error "No parameter specified!" - - #endif } @@ -510,11 +495,7 @@ static unsigned int rej_eta(int32_t *a, * - const uint8_t seed[]: byte array with seed of length CRHBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#if ETA == 2 #define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) -#elif ETA == 4 -#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) -#endif void poly_uniform_eta(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { @@ -661,7 +642,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); - #if ETA == 2 for (i = 0; i < N / 8; ++i) { t[0] = ETA - a->coeffs[8 * i + 0]; @@ -678,19 +658,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); } - #elif ETA == 4 - - for (i = 0; i < N / 2; ++i) { - t[0] = ETA - a->coeffs[2 * i + 0]; - t[1] = ETA - a->coeffs[2 * i + 1]; - r[i] = t[0] | (t[1] << 4); - } - - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -707,7 +674,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - #if ETA == 2 for (i = 0; i < N / 8; ++i) { r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; @@ -729,20 +695,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; } - #elif ETA == 4 - - for (i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[i] & 0x0F; - r->coeffs[2 * i + 1] = a[i] >> 4; - r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; - } - - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium3/aarch64/params.h b/crypto_sign/dilithium3/aarch64/params.h index 646444d5..80ca9bf0 100644 --- a/crypto_sign/dilithium3/aarch64/params.h +++ b/crypto_sign/dilithium3/aarch64/params.h @@ -85,11 +85,7 @@ #define POLYW1_PACKEDBYTES 128 #endif -#if ETA == 2 -#define POLYETA_PACKEDBYTES 96 -#elif ETA == 4 #define POLYETA_PACKEDBYTES 128 -#endif #define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) #define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ diff --git a/crypto_sign/dilithium3/aarch64/poly.c b/crypto_sign/dilithium3/aarch64/poly.c index 687e14f9..8db56570 100644 --- a/crypto_sign/dilithium3/aarch64/poly.c +++ b/crypto_sign/dilithium3/aarch64/poly.c @@ -467,18 +467,6 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - #if ETA == 2 - - if (t0 < 15) { - t0 = t0 - (205 * t0 >> 10) * 5; - a[ctr++] = 2 - t0; - } - if (t1 < 15 && ctr < len) { - t1 = t1 - (205 * t1 >> 10) * 5; - a[ctr++] = 2 - t1; - } - - #elif ETA == 4 if (t0 < 9) { a[ctr++] = 4 - t0; @@ -487,11 +475,6 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 4 - t1; } - #else - -#error "No parameter specified!" - - #endif } @@ -510,11 +493,7 @@ static unsigned int rej_eta(int32_t *a, * - const uint8_t seed[]: byte array with seed of length CRHBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#if ETA == 2 -#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) -#elif ETA == 4 #define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) -#endif void poly_uniform_eta(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { @@ -661,24 +640,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); - #if ETA == 2 - - for (i = 0; i < N / 8; ++i) { - t[0] = ETA - a->coeffs[8 * i + 0]; - t[1] = ETA - a->coeffs[8 * i + 1]; - t[2] = ETA - a->coeffs[8 * i + 2]; - t[3] = ETA - a->coeffs[8 * i + 3]; - t[4] = ETA - a->coeffs[8 * i + 4]; - t[5] = ETA - a->coeffs[8 * i + 5]; - t[6] = ETA - a->coeffs[8 * i + 6]; - t[7] = ETA - a->coeffs[8 * i + 7]; - - r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); - r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); - r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); - } - - #elif ETA == 4 for (i = 0; i < N / 2; ++i) { t[0] = ETA - a->coeffs[2 * i + 0]; @@ -686,11 +647,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { r[i] = t[0] | (t[1] << 4); } - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -707,29 +663,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - #if ETA == 2 - - for (i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; - r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; - r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; - r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; - r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; - r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; - r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; - r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; - - r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; - r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; - r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; - r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; - r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; - r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; - r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; - r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; - } - - #elif ETA == 4 for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[i] & 0x0F; @@ -738,11 +671,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; } - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium5/aarch64/params.h b/crypto_sign/dilithium5/aarch64/params.h index 03e5322f..0888d612 100644 --- a/crypto_sign/dilithium5/aarch64/params.h +++ b/crypto_sign/dilithium5/aarch64/params.h @@ -85,11 +85,7 @@ #define POLYW1_PACKEDBYTES 128 #endif -#if ETA == 2 #define POLYETA_PACKEDBYTES 96 -#elif ETA == 4 -#define POLYETA_PACKEDBYTES 128 -#endif #define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) #define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ diff --git a/crypto_sign/dilithium5/aarch64/poly.c b/crypto_sign/dilithium5/aarch64/poly.c index 84b4487e..6850d699 100644 --- a/crypto_sign/dilithium5/aarch64/poly.c +++ b/crypto_sign/dilithium5/aarch64/poly.c @@ -467,7 +467,6 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - #if ETA == 2 if (t0 < 15) { t0 = t0 - (205 * t0 >> 10) * 5; @@ -478,20 +477,6 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 2 - t1; } - #elif ETA == 4 - - if (t0 < 9) { - a[ctr++] = 4 - t0; - } - if (t1 < 9 && ctr < len) { - a[ctr++] = 4 - t1; - } - - #else - -#error "No parameter specified!" - - #endif } @@ -510,11 +495,7 @@ static unsigned int rej_eta(int32_t *a, * - const uint8_t seed[]: byte array with seed of length CRHBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#if ETA == 2 #define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) -#elif ETA == 4 -#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) -#endif void poly_uniform_eta(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { @@ -661,7 +642,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); - #if ETA == 2 for (i = 0; i < N / 8; ++i) { t[0] = ETA - a->coeffs[8 * i + 0]; @@ -678,19 +658,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); } - #elif ETA == 4 - - for (i = 0; i < N / 2; ++i) { - t[0] = ETA - a->coeffs[2 * i + 0]; - t[1] = ETA - a->coeffs[2 * i + 1]; - r[i] = t[0] | (t[1] << 4); - } - - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -707,7 +674,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - #if ETA == 2 for (i = 0; i < N / 8; ++i) { r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; @@ -729,20 +695,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; } - #elif ETA == 4 - - for (i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[i] & 0x0F; - r->coeffs[2 * i + 1] = a[i] >> 4; - r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; - } - - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } From 7d148ac6eba2c42adeedb18460412b1e8ff2662f Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 18:49:01 +0100 Subject: [PATCH 70/85] rm more #if --- crypto_sign/dilithium2/aarch64/params.h | 48 ++++--------------------- crypto_sign/dilithium3/aarch64/params.h | 42 ++-------------------- crypto_sign/dilithium5/aarch64/params.h | 46 +++--------------------- 3 files changed, 14 insertions(+), 122 deletions(-) diff --git a/crypto_sign/dilithium2/aarch64/params.h b/crypto_sign/dilithium2/aarch64/params.h index cf8e2985..2eca982f 100644 --- a/crypto_sign/dilithium2/aarch64/params.h +++ b/crypto_sign/dilithium2/aarch64/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_PARAMS_H +#define PQCLEAN_DILITHIUM2_AARCH64_PARAMS_H /* * This file is dual licensed @@ -8,8 +8,8 @@ */ #define DILITHIUM_MODE 2 -//#define DILITHIUM_MODE 3 -//#define DILITHIUM_MODE 5 +// #define DILITHIUM_MODE 3 +// #define DILITHIUM_MODE 5 #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM2_AARCH64_##s #define CRYPTO_NAMESPACETOP crypto_sign @@ -25,7 +25,6 @@ #define D 13 #define ROOT_OF_UNITY 1753 -#if DILITHIUM_MODE == 2 #define K 4 #define L 4 @@ -37,35 +36,6 @@ #define OMEGA 80 #define CRYPTO_ALGNAME "Dilithium2" #define CTILDEBYTES 32 -#elif DILITHIUM_MODE == 3 - -#define K 6 -#define L 5 -#define ETA 4 -#define TAU 49 -#define BETA 196 -#define GAMMA1 (1 << 19) -#define GAMMA2 ((DILITHIUM_Q-1)/32) -#define OMEGA 55 -#define CRYPTO_ALGNAME "Dilithium3" -#define CTILDEBYTES 48 -#elif DILITHIUM_MODE == 5 - -#define K 8 -#define L 7 -#define ETA 2 -#define TAU 60 -#define BETA 120 -#define GAMMA1 (1 << 19) -#define GAMMA2 ((DILITHIUM_Q-1)/32) -#define OMEGA 75 -#define CRYPTO_ALGNAME "Dilithium5" -#define CTILDEBYTES 64 -#else - -#error "No parameter specified!" - -#endif @@ -73,17 +43,11 @@ #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) -#if GAMMA1 == (1 << 17) +// GAMMA1 == (1 << 17) #define POLYZ_PACKEDBYTES 576 -#elif GAMMA1 == (1 << 19) -#define POLYZ_PACKEDBYTES 640 -#endif -#if GAMMA2 == (DILITHIUM_Q-1)/88 +// GAMMA2 == (DILITHIUM_Q-1)/88 #define POLYW1_PACKEDBYTES 192 -#elif GAMMA2 == (DILITHIUM_Q-1)/32 -#define POLYW1_PACKEDBYTES 128 -#endif #define POLYETA_PACKEDBYTES 96 diff --git a/crypto_sign/dilithium3/aarch64/params.h b/crypto_sign/dilithium3/aarch64/params.h index 80ca9bf0..52803392 100644 --- a/crypto_sign/dilithium3/aarch64/params.h +++ b/crypto_sign/dilithium3/aarch64/params.h @@ -9,7 +9,7 @@ // #define DILITHIUM_MODE 2 #define DILITHIUM_MODE 3 -//#define DILITHIUM_MODE 5 +// #define DILITHIUM_MODE 5 #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM3_AARCH64_##s #define CRYPTO_NAMESPACETOP crypto_sign @@ -25,19 +25,6 @@ #define D 13 #define ROOT_OF_UNITY 1753 -#if DILITHIUM_MODE == 2 - -#define K 4 -#define L 4 -#define ETA 2 -#define TAU 39 -#define BETA 78 -#define GAMMA1 (1 << 17) -#define GAMMA2 ((DILITHIUM_Q-1)/88) -#define OMEGA 80 -#define CRYPTO_ALGNAME "Dilithium2" -#define CTILDEBYTES 32 -#elif DILITHIUM_MODE == 3 #define K 6 #define L 5 @@ -49,23 +36,6 @@ #define OMEGA 55 #define CRYPTO_ALGNAME "Dilithium3" #define CTILDEBYTES 48 -#elif DILITHIUM_MODE == 5 - -#define K 8 -#define L 7 -#define ETA 2 -#define TAU 60 -#define BETA 120 -#define GAMMA1 (1 << 19) -#define GAMMA2 ((DILITHIUM_Q-1)/32) -#define OMEGA 75 -#define CRYPTO_ALGNAME "Dilithium5" -#define CTILDEBYTES 64 -#else - -#error "No parameter specified!" - -#endif @@ -73,17 +43,11 @@ #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) -#if GAMMA1 == (1 << 17) -#define POLYZ_PACKEDBYTES 576 -#elif GAMMA1 == (1 << 19) +// GAMMA1 == (1 << 19) #define POLYZ_PACKEDBYTES 640 -#endif -#if GAMMA2 == (DILITHIUM_Q-1)/88 -#define POLYW1_PACKEDBYTES 192 -#elif GAMMA2 == (DILITHIUM_Q-1)/32 +// GAMMA2 == (DILITHIUM_Q-1)/32 #define POLYW1_PACKEDBYTES 128 -#endif #define POLYETA_PACKEDBYTES 128 diff --git a/crypto_sign/dilithium5/aarch64/params.h b/crypto_sign/dilithium5/aarch64/params.h index 0888d612..e86fc797 100644 --- a/crypto_sign/dilithium5/aarch64/params.h +++ b/crypto_sign/dilithium5/aarch64/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_PARAMS_H +#define PQCLEAN_DILITHIUM5_AARCH64_PARAMS_H /* * This file is dual licensed @@ -8,7 +8,7 @@ */ // #define DILITHIUM_MODE 2 -//#define DILITHIUM_MODE 3 +// #define DILITHIUM_MODE 3 #define DILITHIUM_MODE 5 #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM5_AARCH64_##s @@ -25,31 +25,6 @@ #define D 13 #define ROOT_OF_UNITY 1753 -#if DILITHIUM_MODE == 2 - -#define K 4 -#define L 4 -#define ETA 2 -#define TAU 39 -#define BETA 78 -#define GAMMA1 (1 << 17) -#define GAMMA2 ((DILITHIUM_Q-1)/88) -#define OMEGA 80 -#define CRYPTO_ALGNAME "Dilithium2" -#define CTILDEBYTES 32 -#elif DILITHIUM_MODE == 3 - -#define K 6 -#define L 5 -#define ETA 4 -#define TAU 49 -#define BETA 196 -#define GAMMA1 (1 << 19) -#define GAMMA2 ((DILITHIUM_Q-1)/32) -#define OMEGA 55 -#define CRYPTO_ALGNAME "Dilithium3" -#define CTILDEBYTES 48 -#elif DILITHIUM_MODE == 5 #define K 8 #define L 7 @@ -61,11 +36,6 @@ #define OMEGA 75 #define CRYPTO_ALGNAME "Dilithium5" #define CTILDEBYTES 64 -#else - -#error "No parameter specified!" - -#endif @@ -73,17 +43,11 @@ #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) -#if GAMMA1 == (1 << 17) -#define POLYZ_PACKEDBYTES 576 -#elif GAMMA1 == (1 << 19) +// GAMMA1 == (1 << 19) #define POLYZ_PACKEDBYTES 640 -#endif -#if GAMMA2 == (DILITHIUM_Q-1)/88 -#define POLYW1_PACKEDBYTES 192 -#elif GAMMA2 == (DILITHIUM_Q-1)/32 +// GAMMA2 == (DILITHIUM_Q-1)/88 #define POLYW1_PACKEDBYTES 128 -#endif #define POLYETA_PACKEDBYTES 96 From 3573ca17d5e248ae0aee4c1b841e0e4ea7a11dc6 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 20 Nov 2023 18:56:48 +0100 Subject: [PATCH 71/85] rm more #if --- crypto_sign/dilithium2/aarch64/poly.c | 55 ---------------- crypto_sign/dilithium2/aarch64/rounding.c | 21 ------ crypto_sign/dilithium3/aarch64/poly.c | 80 ----------------------- crypto_sign/dilithium3/aarch64/rounding.c | 21 ------ crypto_sign/dilithium5/aarch64/poly.c | 80 ----------------------- crypto_sign/dilithium5/aarch64/rounding.c | 21 ------ 6 files changed, 278 deletions(-) diff --git a/crypto_sign/dilithium2/aarch64/poly.c b/crypto_sign/dilithium2/aarch64/poly.c index 2e5a1f30..05dc1710 100644 --- a/crypto_sign/dilithium2/aarch64/poly.c +++ b/crypto_sign/dilithium2/aarch64/poly.c @@ -868,7 +868,6 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); - #if GAMMA1 == (1 << 17) for (i = 0; i < N / 4; ++i) { t[0] = GAMMA1 - a->coeffs[4 * i + 0]; @@ -890,25 +889,6 @@ void polyz_pack(uint8_t *r, const poly *a) { r[9 * i + 8] = t[3] >> 10; } - #elif GAMMA1 == (1 << 19) - - for (i = 0; i < N / 2; ++i) { - t[0] = GAMMA1 - a->coeffs[2 * i + 0]; - t[1] = GAMMA1 - a->coeffs[2 * i + 1]; - - r[5 * i + 0] = t[0]; - r[5 * i + 1] = t[0] >> 8; - r[5 * i + 2] = t[0] >> 16; - r[5 * i + 2] |= t[1] << 4; - r[5 * i + 3] = t[1] >> 4; - r[5 * i + 4] = t[1] >> 12; - } - - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -926,7 +906,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - #if GAMMA1 == (1 << 17) for (i = 0; i < N / 4; ++i) { r->coeffs[4 * i + 0] = a[9 * i + 0]; @@ -955,28 +934,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; } - #elif GAMMA1 == (1 << 19) - - for (i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16; - r->coeffs[2 * i + 0] &= 0xFFFFF; - - r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; - r->coeffs[2 * i + 0] &= 0xFFFFF; - - r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; - } - - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -995,7 +952,6 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); - #if GAMMA2 == (DILITHIUM_Q-1)/88 for (i = 0; i < N / 4; ++i) { r[3 * i + 0] = a->coeffs[4 * i + 0]; @@ -1006,17 +962,6 @@ void polyw1_pack(uint8_t *r, const poly *a) { r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2; } - #elif GAMMA2 == (DILITHIUM_Q-1)/32 - - for (i = 0; i < N / 2; ++i) { - r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); - } - - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium2/aarch64/rounding.c b/crypto_sign/dilithium2/aarch64/rounding.c index 30c97510..342e9683 100644 --- a/crypto_sign/dilithium2/aarch64/rounding.c +++ b/crypto_sign/dilithium2/aarch64/rounding.c @@ -47,21 +47,10 @@ int32_t decompose(int32_t *a0, int32_t a) { int32_t a1; a1 = (a + 127) >> 7; - #if GAMMA2 == (DILITHIUM_Q-1)/32 - - a1 = (a1 * 1025 + (1 << 21)) >> 22; - a1 &= 15; - - #elif GAMMA2 == (DILITHIUM_Q-1)/88 a1 = (a1 * 11275 + (1 << 23)) >> 24; a1 ^= ((43 - a1) >> 31) & a1; - #else - -#error "No parameter specified" - - #endif *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; @@ -105,15 +94,6 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } - #if GAMMA2 == (DILITHIUM_Q-1)/32 - - if (a0 > 0) { - return (a1 + 1) & 15; - } else { - return (a1 - 1) & 15; - } - - #elif GAMMA2 == (DILITHIUM_Q-1)/88 if (a0 > 0) { return (a1 == 43) ? 0 : a1 + 1; @@ -121,6 +101,5 @@ int32_t use_hint(int32_t a, unsigned int hint) { return (a1 == 0) ? 43 : a1 - 1; } - #endif } diff --git a/crypto_sign/dilithium3/aarch64/poly.c b/crypto_sign/dilithium3/aarch64/poly.c index 8db56570..5399da4b 100644 --- a/crypto_sign/dilithium3/aarch64/poly.c +++ b/crypto_sign/dilithium3/aarch64/poly.c @@ -844,29 +844,6 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); - #if GAMMA1 == (1 << 17) - - for (i = 0; i < N / 4; ++i) { - t[0] = GAMMA1 - a->coeffs[4 * i + 0]; - t[1] = GAMMA1 - a->coeffs[4 * i + 1]; - t[2] = GAMMA1 - a->coeffs[4 * i + 2]; - t[3] = GAMMA1 - a->coeffs[4 * i + 3]; - - r[9 * i + 0] = t[0]; - r[9 * i + 1] = t[0] >> 8; - r[9 * i + 2] = t[0] >> 16; - r[9 * i + 2] |= t[1] << 2; - r[9 * i + 3] = t[1] >> 6; - r[9 * i + 4] = t[1] >> 14; - r[9 * i + 4] |= t[2] << 4; - r[9 * i + 5] = t[2] >> 4; - r[9 * i + 6] = t[2] >> 12; - r[9 * i + 6] |= t[3] << 6; - r[9 * i + 7] = t[3] >> 2; - r[9 * i + 8] = t[3] >> 10; - } - - #elif GAMMA1 == (1 << 19) for (i = 0; i < N / 2; ++i) { t[0] = GAMMA1 - a->coeffs[2 * i + 0]; @@ -880,11 +857,6 @@ void polyz_pack(uint8_t *r, const poly *a) { r[5 * i + 4] = t[1] >> 12; } - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -902,36 +874,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - #if GAMMA1 == (1 << 17) - - for (i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[9 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; - r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16; - r->coeffs[4 * i + 0] &= 0x3FFFF; - - r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2; - r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6; - r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14; - r->coeffs[4 * i + 1] &= 0x3FFFF; - - r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12; - r->coeffs[4 * i + 2] &= 0x3FFFF; - - r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6; - r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2; - r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10; - r->coeffs[4 * i + 3] &= 0x3FFFF; - - r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0]; - r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1]; - r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2]; - r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; - } - - #elif GAMMA1 == (1 << 19) for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; @@ -948,11 +890,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; } - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -971,28 +908,11 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); - #if GAMMA2 == (DILITHIUM_Q-1)/88 - - for (i = 0; i < N / 4; ++i) { - r[3 * i + 0] = a->coeffs[4 * i + 0]; - r[3 * i + 0] |= a->coeffs[4 * i + 1] << 6; - r[3 * i + 1] = a->coeffs[4 * i + 1] >> 2; - r[3 * i + 1] |= a->coeffs[4 * i + 2] << 4; - r[3 * i + 2] = a->coeffs[4 * i + 2] >> 4; - r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2; - } - - #elif GAMMA2 == (DILITHIUM_Q-1)/32 for (i = 0; i < N / 2; ++i) { r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); } - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium3/aarch64/rounding.c b/crypto_sign/dilithium3/aarch64/rounding.c index 30c97510..b0068bd4 100644 --- a/crypto_sign/dilithium3/aarch64/rounding.c +++ b/crypto_sign/dilithium3/aarch64/rounding.c @@ -47,21 +47,10 @@ int32_t decompose(int32_t *a0, int32_t a) { int32_t a1; a1 = (a + 127) >> 7; - #if GAMMA2 == (DILITHIUM_Q-1)/32 a1 = (a1 * 1025 + (1 << 21)) >> 22; a1 &= 15; - #elif GAMMA2 == (DILITHIUM_Q-1)/88 - - a1 = (a1 * 11275 + (1 << 23)) >> 24; - a1 ^= ((43 - a1) >> 31) & a1; - - #else - -#error "No parameter specified" - - #endif *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; @@ -105,7 +94,6 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } - #if GAMMA2 == (DILITHIUM_Q-1)/32 if (a0 > 0) { return (a1 + 1) & 15; @@ -113,14 +101,5 @@ int32_t use_hint(int32_t a, unsigned int hint) { return (a1 - 1) & 15; } - #elif GAMMA2 == (DILITHIUM_Q-1)/88 - - if (a0 > 0) { - return (a1 == 43) ? 0 : a1 + 1; - } else { - return (a1 == 0) ? 43 : a1 - 1; - } - - #endif } diff --git a/crypto_sign/dilithium5/aarch64/poly.c b/crypto_sign/dilithium5/aarch64/poly.c index 6850d699..ad3b1109 100644 --- a/crypto_sign/dilithium5/aarch64/poly.c +++ b/crypto_sign/dilithium5/aarch64/poly.c @@ -868,29 +868,6 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); - #if GAMMA1 == (1 << 17) - - for (i = 0; i < N / 4; ++i) { - t[0] = GAMMA1 - a->coeffs[4 * i + 0]; - t[1] = GAMMA1 - a->coeffs[4 * i + 1]; - t[2] = GAMMA1 - a->coeffs[4 * i + 2]; - t[3] = GAMMA1 - a->coeffs[4 * i + 3]; - - r[9 * i + 0] = t[0]; - r[9 * i + 1] = t[0] >> 8; - r[9 * i + 2] = t[0] >> 16; - r[9 * i + 2] |= t[1] << 2; - r[9 * i + 3] = t[1] >> 6; - r[9 * i + 4] = t[1] >> 14; - r[9 * i + 4] |= t[2] << 4; - r[9 * i + 5] = t[2] >> 4; - r[9 * i + 6] = t[2] >> 12; - r[9 * i + 6] |= t[3] << 6; - r[9 * i + 7] = t[3] >> 2; - r[9 * i + 8] = t[3] >> 10; - } - - #elif GAMMA1 == (1 << 19) for (i = 0; i < N / 2; ++i) { t[0] = GAMMA1 - a->coeffs[2 * i + 0]; @@ -904,11 +881,6 @@ void polyz_pack(uint8_t *r, const poly *a) { r[5 * i + 4] = t[1] >> 12; } - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -926,36 +898,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - #if GAMMA1 == (1 << 17) - - for (i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[9 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; - r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16; - r->coeffs[4 * i + 0] &= 0x3FFFF; - - r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2; - r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6; - r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14; - r->coeffs[4 * i + 1] &= 0x3FFFF; - - r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12; - r->coeffs[4 * i + 2] &= 0x3FFFF; - - r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6; - r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2; - r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10; - r->coeffs[4 * i + 3] &= 0x3FFFF; - - r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0]; - r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1]; - r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2]; - r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; - } - - #elif GAMMA1 == (1 << 19) for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; @@ -972,11 +914,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; } - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } @@ -995,28 +932,11 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); - #if GAMMA2 == (DILITHIUM_Q-1)/88 - - for (i = 0; i < N / 4; ++i) { - r[3 * i + 0] = a->coeffs[4 * i + 0]; - r[3 * i + 0] |= a->coeffs[4 * i + 1] << 6; - r[3 * i + 1] = a->coeffs[4 * i + 1] >> 2; - r[3 * i + 1] |= a->coeffs[4 * i + 2] << 4; - r[3 * i + 2] = a->coeffs[4 * i + 2] >> 4; - r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2; - } - - #elif GAMMA2 == (DILITHIUM_Q-1)/32 for (i = 0; i < N / 2; ++i) { r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); } - #else - -#error "No parameter specified!" - - #endif DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium5/aarch64/rounding.c b/crypto_sign/dilithium5/aarch64/rounding.c index 30c97510..b0068bd4 100644 --- a/crypto_sign/dilithium5/aarch64/rounding.c +++ b/crypto_sign/dilithium5/aarch64/rounding.c @@ -47,21 +47,10 @@ int32_t decompose(int32_t *a0, int32_t a) { int32_t a1; a1 = (a + 127) >> 7; - #if GAMMA2 == (DILITHIUM_Q-1)/32 a1 = (a1 * 1025 + (1 << 21)) >> 22; a1 &= 15; - #elif GAMMA2 == (DILITHIUM_Q-1)/88 - - a1 = (a1 * 11275 + (1 << 23)) >> 24; - a1 ^= ((43 - a1) >> 31) & a1; - - #else - -#error "No parameter specified" - - #endif *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; @@ -105,7 +94,6 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } - #if GAMMA2 == (DILITHIUM_Q-1)/32 if (a0 > 0) { return (a1 + 1) & 15; @@ -113,14 +101,5 @@ int32_t use_hint(int32_t a, unsigned int hint) { return (a1 - 1) & 15; } - #elif GAMMA2 == (DILITHIUM_Q-1)/88 - - if (a0 > 0) { - return (a1 == 43) ? 0 : a1 + 1; - } else { - return (a1 == 0) ? 43 : a1 - 1; - } - - #endif } From 285ec75438e4c40098707d318f2a75d54ea40af6 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Sun, 31 Dec 2023 01:10:41 +0800 Subject: [PATCH 72/85] extract fips202x2, feat --- {crypto_kem/kyber768/aarch64 => common}/feat.S | 0 {crypto_kem/kyber768/aarch64 => common}/fips202x2.c | 0 {crypto_kem/kyber768/aarch64 => common}/fips202x2.h | 0 crypto_kem/kyber1024/aarch64/Makefile | 4 ++-- crypto_kem/kyber512/aarch64/Makefile | 4 ++-- crypto_kem/kyber768/aarch64/Makefile | 4 ++-- crypto_sign/dilithium2/aarch64/Makefile | 4 ++-- crypto_sign/dilithium3/aarch64/Makefile | 4 ++-- crypto_sign/dilithium5/aarch64/Makefile | 4 ++-- 9 files changed, 12 insertions(+), 12 deletions(-) rename {crypto_kem/kyber768/aarch64 => common}/feat.S (100%) rename {crypto_kem/kyber768/aarch64 => common}/fips202x2.c (100%) rename {crypto_kem/kyber768/aarch64 => common}/fips202x2.h (100%) diff --git a/crypto_kem/kyber768/aarch64/feat.S b/common/feat.S similarity index 100% rename from crypto_kem/kyber768/aarch64/feat.S rename to common/feat.S diff --git a/crypto_kem/kyber768/aarch64/fips202x2.c b/common/fips202x2.c similarity index 100% rename from crypto_kem/kyber768/aarch64/fips202x2.c rename to common/fips202x2.c diff --git a/crypto_kem/kyber768/aarch64/fips202x2.h b/common/fips202x2.h similarity index 100% rename from crypto_kem/kyber768/aarch64/fips202x2.h rename to common/fips202x2.h diff --git a/crypto_kem/kyber1024/aarch64/Makefile b/crypto_kem/kyber1024/aarch64/Makefile index 82aded27..21ba6461 100644 --- a/crypto_kem/kyber1024/aarch64/Makefile +++ b/crypto_kem/kyber1024/aarch64/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber1024_aarch64.a -HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h -OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o feat.o +HEADERS=api.h cbd.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h +OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber512/aarch64/Makefile b/crypto_kem/kyber512/aarch64/Makefile index c6affa40..29330759 100644 --- a/crypto_kem/kyber512/aarch64/Makefile +++ b/crypto_kem/kyber512/aarch64/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber512_aarch64.a -HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h -OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o feat.o +HEADERS=api.h cbd.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h +OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber768/aarch64/Makefile b/crypto_kem/kyber768/aarch64/Makefile index e2d24a69..26a76ea5 100644 --- a/crypto_kem/kyber768/aarch64/Makefile +++ b/crypto_kem/kyber768/aarch64/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber768_aarch64.a -HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h -OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o feat.o +HEADERS=api.h cbd.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h +OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_sign/dilithium2/aarch64/Makefile b/crypto_sign/dilithium2/aarch64/Makefile index 2e951038..c443defa 100644 --- a/crypto_sign/dilithium2/aarch64/Makefile +++ b/crypto_sign/dilithium2/aarch64/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium2_aarch64.a -HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h -OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o feat.o +HEADERS=api.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g diff --git a/crypto_sign/dilithium3/aarch64/Makefile b/crypto_sign/dilithium3/aarch64/Makefile index 3c2ad454..ef1c358a 100644 --- a/crypto_sign/dilithium3/aarch64/Makefile +++ b/crypto_sign/dilithium3/aarch64/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium3_aarch64.a -HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h -OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o feat.o +HEADERS=api.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g diff --git a/crypto_sign/dilithium5/aarch64/Makefile b/crypto_sign/dilithium5/aarch64/Makefile index 8a156266..006d82b2 100644 --- a/crypto_sign/dilithium5/aarch64/Makefile +++ b/crypto_sign/dilithium5/aarch64/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium5_aarch64.a -HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h -OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o feat.o +HEADERS=api.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g From 43aeb28e4579fe3084a794e3ab4eee2c5769a487 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Sun, 31 Dec 2023 01:51:25 +0800 Subject: [PATCH 73/85] add missing stdint --- common/fips202x2.c | 1 - common/fips202x2.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/common/fips202x2.c b/common/fips202x2.c index c8ebcd36..11994be8 100644 --- a/common/fips202x2.c +++ b/common/fips202x2.c @@ -34,7 +34,6 @@ */ #include -#include #include "fips202x2.h" diff --git a/common/fips202x2.h b/common/fips202x2.h index 3066c52b..1274a6d2 100644 --- a/common/fips202x2.h +++ b/common/fips202x2.h @@ -9,6 +9,7 @@ */ #include +#include #include typedef uint64x2_t v128; From 8d45604bfaa833c6895b8529feaf3fd4297fe395 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Sun, 31 Dec 2023 01:56:04 +0800 Subject: [PATCH 74/85] include local first --- common/fips202x2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/fips202x2.c b/common/fips202x2.c index 11994be8..d0e8efdc 100644 --- a/common/fips202x2.c +++ b/common/fips202x2.c @@ -33,9 +33,8 @@ * SOFTWARE. */ -#include #include "fips202x2.h" - +#include #define NROUNDS 24 From d1f5339ec7ac5d8750cab5ddcc13b07ec344e7fd Mon Sep 17 00:00:00 2001 From: Thom Wiggers Date: Tue, 2 Jan 2024 09:58:12 +0100 Subject: [PATCH 75/85] Move keccak2x into own folder to avoid conflicts --- common/{ => keccak2x}/fips202x2.c | 0 common/{ => keccak2x}/fips202x2.h | 0 crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c | 2 +- crypto_kem/kyber1024/aarch64/symmetric.h | 2 +- crypto_kem/kyber512/aarch64/neon_symmetric-shake.c | 2 +- crypto_kem/kyber512/aarch64/symmetric.h | 2 +- crypto_kem/kyber768/aarch64/neon_symmetric-shake.c | 2 +- crypto_kem/kyber768/aarch64/symmetric.h | 2 +- 8 files changed, 6 insertions(+), 6 deletions(-) rename common/{ => keccak2x}/fips202x2.c (100%) rename common/{ => keccak2x}/fips202x2.h (100%) diff --git a/common/fips202x2.c b/common/keccak2x/fips202x2.c similarity index 100% rename from common/fips202x2.c rename to common/keccak2x/fips202x2.c diff --git a/common/fips202x2.h b/common/keccak2x/fips202x2.h similarity index 100% rename from common/fips202x2.h rename to common/keccak2x/fips202x2.h diff --git a/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c b/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c index aa096294..9a59724e 100644 --- a/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c +++ b/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c @@ -36,7 +36,7 @@ #include #include #include "params.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "symmetric.h" /************************************************* diff --git a/crypto_kem/kyber1024/aarch64/symmetric.h b/crypto_kem/kyber1024/aarch64/symmetric.h index 0c2dd991..715248da 100644 --- a/crypto_kem/kyber1024/aarch64/symmetric.h +++ b/crypto_kem/kyber1024/aarch64/symmetric.h @@ -12,7 +12,7 @@ #include #include "params.h" -#include "fips202.h" +#include "keccak2x/fips202.h" typedef shake128ctx xof_state; diff --git a/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c b/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c index aa096294..9a59724e 100644 --- a/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c +++ b/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c @@ -36,7 +36,7 @@ #include #include #include "params.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "symmetric.h" /************************************************* diff --git a/crypto_kem/kyber512/aarch64/symmetric.h b/crypto_kem/kyber512/aarch64/symmetric.h index 3c4c5074..5fef65f6 100644 --- a/crypto_kem/kyber512/aarch64/symmetric.h +++ b/crypto_kem/kyber512/aarch64/symmetric.h @@ -12,7 +12,7 @@ #include #include "params.h" -#include "fips202.h" +#include "keccak2x/fips202.h" typedef shake128ctx xof_state; diff --git a/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c b/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c index aa096294..9a59724e 100644 --- a/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c +++ b/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c @@ -36,7 +36,7 @@ #include #include #include "params.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "symmetric.h" /************************************************* diff --git a/crypto_kem/kyber768/aarch64/symmetric.h b/crypto_kem/kyber768/aarch64/symmetric.h index 336fe4da..55c0fdd9 100644 --- a/crypto_kem/kyber768/aarch64/symmetric.h +++ b/crypto_kem/kyber768/aarch64/symmetric.h @@ -12,7 +12,7 @@ #include #include "params.h" -#include "fips202.h" +#include "keccak2x/fips202.h" typedef shake128ctx xof_state; From 6760e751c3d724ea09b5d3675145a3be8007dacd Mon Sep 17 00:00:00 2001 From: Thom Wiggers Date: Tue, 2 Jan 2024 12:19:35 +0100 Subject: [PATCH 76/85] Apply Astyle --- crypto_kem/kyber1024/aarch64/fips202x2.c | 1 - crypto_kem/kyber1024/aarch64/kem.c | 3 --- crypto_kem/kyber1024/aarch64/ntt.c | 6 +++--- crypto_kem/kyber1024/aarch64/ntt.h | 2 -- crypto_kem/kyber1024/aarch64/symmetric-shake.c | 2 -- crypto_kem/kyber1024/aarch64/symmetric.h | 4 ---- crypto_kem/kyber512/aarch64/fips202x2.c | 1 - crypto_kem/kyber512/aarch64/kem.c | 3 --- crypto_kem/kyber512/aarch64/ntt.c | 6 +++--- crypto_kem/kyber512/aarch64/ntt.h | 2 -- crypto_kem/kyber512/aarch64/symmetric-shake.c | 2 -- crypto_kem/kyber512/aarch64/symmetric.h | 4 ---- crypto_kem/kyber768/aarch64/kem.c | 3 --- crypto_kem/kyber768/aarch64/ntt.c | 6 +++--- crypto_kem/kyber768/aarch64/ntt.h | 2 -- crypto_kem/kyber768/aarch64/symmetric-shake.c | 2 -- crypto_kem/kyber768/aarch64/symmetric.h | 4 ---- 17 files changed, 9 insertions(+), 44 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/fips202x2.c b/crypto_kem/kyber1024/aarch64/fips202x2.c index c8ebcd36..054c3140 100644 --- a/crypto_kem/kyber1024/aarch64/fips202x2.c +++ b/crypto_kem/kyber1024/aarch64/fips202x2.c @@ -37,7 +37,6 @@ #include #include "fips202x2.h" - #define NROUNDS 24 // Define NEON operation diff --git a/crypto_kem/kyber1024/aarch64/kem.c b/crypto_kem/kyber1024/aarch64/kem.c index d694befa..572b5e93 100644 --- a/crypto_kem/kyber1024/aarch64/kem.c +++ b/crypto_kem/kyber1024/aarch64/kem.c @@ -17,7 +17,6 @@ #include "symmetric.h" #include "randombytes.h" - /************************************************* * Name: crypto_kem_keypair_derand * @@ -65,8 +64,6 @@ int crypto_kem_keypair(uint8_t *pk, return 0; } - - /************************************************* * Name: crypto_kem_enc_derand * diff --git a/crypto_kem/kyber1024/aarch64/ntt.c b/crypto_kem/kyber1024/aarch64/ntt.c index 69cb756f..09583b73 100644 --- a/crypto_kem/kyber1024/aarch64/ntt.c +++ b/crypto_kem/kyber1024/aarch64/ntt.c @@ -46,15 +46,15 @@ const __attribute__ ((aligned (16)))int16_t constants[16] = { }; const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { -0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 + 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 }; const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { -167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 + 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 }; const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { -0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 + 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 }; /************************************************* diff --git a/crypto_kem/kyber1024/aarch64/ntt.h b/crypto_kem/kyber1024/aarch64/ntt.h index 5a18158c..cd74b9a8 100644 --- a/crypto_kem/kyber1024/aarch64/ntt.h +++ b/crypto_kem/kyber1024/aarch64/ntt.h @@ -61,7 +61,6 @@ const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; extern const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; - #define NTT(in) do { \ PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ @@ -77,5 +76,4 @@ void ntt(int16_t r[256]); #define invntt KYBER_NAMESPACE(invntt) void invntt(int16_t r[256]); - #endif diff --git a/crypto_kem/kyber1024/aarch64/symmetric-shake.c b/crypto_kem/kyber1024/aarch64/symmetric-shake.c index 14a4c28c..067922ec 100644 --- a/crypto_kem/kyber1024/aarch64/symmetric-shake.c +++ b/crypto_kem/kyber1024/aarch64/symmetric-shake.c @@ -55,8 +55,6 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM shake256(out, outlen, extkey, sizeof(extkey)); } - - /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf * diff --git a/crypto_kem/kyber1024/aarch64/symmetric.h b/crypto_kem/kyber1024/aarch64/symmetric.h index 715248da..1eb92e61 100644 --- a/crypto_kem/kyber1024/aarch64/symmetric.h +++ b/crypto_kem/kyber1024/aarch64/symmetric.h @@ -27,7 +27,6 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM #define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf) void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]); - #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) @@ -37,7 +36,6 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) - // NEON Definition #include "fips202x2.h" @@ -67,5 +65,3 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, shake128x2_squeezeblocks(OUT0, OUT1, OUTBLOCKS, STATE) #endif /* SYMMETRIC_H */ - - diff --git a/crypto_kem/kyber512/aarch64/fips202x2.c b/crypto_kem/kyber512/aarch64/fips202x2.c index c8ebcd36..054c3140 100644 --- a/crypto_kem/kyber512/aarch64/fips202x2.c +++ b/crypto_kem/kyber512/aarch64/fips202x2.c @@ -37,7 +37,6 @@ #include #include "fips202x2.h" - #define NROUNDS 24 // Define NEON operation diff --git a/crypto_kem/kyber512/aarch64/kem.c b/crypto_kem/kyber512/aarch64/kem.c index d694befa..572b5e93 100644 --- a/crypto_kem/kyber512/aarch64/kem.c +++ b/crypto_kem/kyber512/aarch64/kem.c @@ -17,7 +17,6 @@ #include "symmetric.h" #include "randombytes.h" - /************************************************* * Name: crypto_kem_keypair_derand * @@ -65,8 +64,6 @@ int crypto_kem_keypair(uint8_t *pk, return 0; } - - /************************************************* * Name: crypto_kem_enc_derand * diff --git a/crypto_kem/kyber512/aarch64/ntt.c b/crypto_kem/kyber512/aarch64/ntt.c index 69cb756f..09583b73 100644 --- a/crypto_kem/kyber512/aarch64/ntt.c +++ b/crypto_kem/kyber512/aarch64/ntt.c @@ -46,15 +46,15 @@ const __attribute__ ((aligned (16)))int16_t constants[16] = { }; const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { -0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 + 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 }; const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { -167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 + 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 }; const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { -0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 + 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 }; /************************************************* diff --git a/crypto_kem/kyber512/aarch64/ntt.h b/crypto_kem/kyber512/aarch64/ntt.h index 141fa225..43307eb2 100644 --- a/crypto_kem/kyber512/aarch64/ntt.h +++ b/crypto_kem/kyber512/aarch64/ntt.h @@ -61,7 +61,6 @@ const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; extern const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; - #define NTT(in) do { \ PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ @@ -77,5 +76,4 @@ void ntt(int16_t r[256]); #define invntt KYBER_NAMESPACE(invntt) void invntt(int16_t r[256]); - #endif diff --git a/crypto_kem/kyber512/aarch64/symmetric-shake.c b/crypto_kem/kyber512/aarch64/symmetric-shake.c index 14a4c28c..067922ec 100644 --- a/crypto_kem/kyber512/aarch64/symmetric-shake.c +++ b/crypto_kem/kyber512/aarch64/symmetric-shake.c @@ -55,8 +55,6 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM shake256(out, outlen, extkey, sizeof(extkey)); } - - /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf * diff --git a/crypto_kem/kyber512/aarch64/symmetric.h b/crypto_kem/kyber512/aarch64/symmetric.h index 5fef65f6..3a6e0310 100644 --- a/crypto_kem/kyber512/aarch64/symmetric.h +++ b/crypto_kem/kyber512/aarch64/symmetric.h @@ -27,7 +27,6 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM #define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf) void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]); - #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) @@ -37,7 +36,6 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) - // NEON Definition #include "fips202x2.h" @@ -67,5 +65,3 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, shake128x2_squeezeblocks(OUT0, OUT1, OUTBLOCKS, STATE) #endif /* SYMMETRIC_H */ - - diff --git a/crypto_kem/kyber768/aarch64/kem.c b/crypto_kem/kyber768/aarch64/kem.c index d694befa..572b5e93 100644 --- a/crypto_kem/kyber768/aarch64/kem.c +++ b/crypto_kem/kyber768/aarch64/kem.c @@ -17,7 +17,6 @@ #include "symmetric.h" #include "randombytes.h" - /************************************************* * Name: crypto_kem_keypair_derand * @@ -65,8 +64,6 @@ int crypto_kem_keypair(uint8_t *pk, return 0; } - - /************************************************* * Name: crypto_kem_enc_derand * diff --git a/crypto_kem/kyber768/aarch64/ntt.c b/crypto_kem/kyber768/aarch64/ntt.c index 69cb756f..09583b73 100644 --- a/crypto_kem/kyber768/aarch64/ntt.c +++ b/crypto_kem/kyber768/aarch64/ntt.c @@ -46,15 +46,15 @@ const __attribute__ ((aligned (16)))int16_t constants[16] = { }; const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { -0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 + 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 }; const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { -167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 + 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 }; const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { -0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 + 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 }; /************************************************* diff --git a/crypto_kem/kyber768/aarch64/ntt.h b/crypto_kem/kyber768/aarch64/ntt.h index cd5fd984..3cf77b53 100644 --- a/crypto_kem/kyber768/aarch64/ntt.h +++ b/crypto_kem/kyber768/aarch64/ntt.h @@ -61,7 +61,6 @@ const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; extern const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; - #define NTT(in) do { \ PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ @@ -77,5 +76,4 @@ void ntt(int16_t r[256]); #define invntt KYBER_NAMESPACE(invntt) void invntt(int16_t r[256]); - #endif diff --git a/crypto_kem/kyber768/aarch64/symmetric-shake.c b/crypto_kem/kyber768/aarch64/symmetric-shake.c index 14a4c28c..067922ec 100644 --- a/crypto_kem/kyber768/aarch64/symmetric-shake.c +++ b/crypto_kem/kyber768/aarch64/symmetric-shake.c @@ -55,8 +55,6 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM shake256(out, outlen, extkey, sizeof(extkey)); } - - /************************************************* * Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf * diff --git a/crypto_kem/kyber768/aarch64/symmetric.h b/crypto_kem/kyber768/aarch64/symmetric.h index 55c0fdd9..bbd137b8 100644 --- a/crypto_kem/kyber768/aarch64/symmetric.h +++ b/crypto_kem/kyber768/aarch64/symmetric.h @@ -27,7 +27,6 @@ void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYM #define kyber_shake256_rkprf KYBER_NAMESPACE(kyber_shake256_rkprf) void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]); - #define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) @@ -37,7 +36,6 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) - // NEON Definition #include "fips202x2.h" @@ -67,5 +65,3 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, shake128x2_squeezeblocks(OUT0, OUT1, OUTBLOCKS, STATE) #endif /* SYMMETRIC_H */ - - From b5f7f97e0b441c2ac127c65f73f388fe2a596417 Mon Sep 17 00:00:00 2001 From: Thom Wiggers Date: Tue, 2 Jan 2024 12:34:34 +0100 Subject: [PATCH 77/85] Fix inclusion of Keccak2x --- crypto_kem/kyber1024/aarch64/Makefile | 12 +- crypto_kem/kyber1024/aarch64/fips202x2.c | 685 ----------------------- crypto_kem/kyber1024/aarch64/symmetric.h | 4 +- crypto_kem/kyber512/aarch64/Makefile | 12 +- crypto_kem/kyber512/aarch64/fips202x2.c | 685 ----------------------- crypto_kem/kyber512/aarch64/symmetric.h | 4 +- crypto_kem/kyber768/aarch64/Makefile | 12 +- crypto_kem/kyber768/aarch64/symmetric.h | 4 +- 8 files changed, 36 insertions(+), 1382 deletions(-) delete mode 100644 crypto_kem/kyber1024/aarch64/fips202x2.c delete mode 100644 crypto_kem/kyber512/aarch64/fips202x2.c diff --git a/crypto_kem/kyber1024/aarch64/Makefile b/crypto_kem/kyber1024/aarch64/Makefile index 21ba6461..5642227d 100644 --- a/crypto_kem/kyber1024/aarch64/Makefile +++ b/crypto_kem/kyber1024/aarch64/Makefile @@ -6,6 +6,10 @@ OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o n CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o +KECCAK2X=$(KECCAK2XDIR)/$(KECCAK2XOBJ) + all: $(LIB) %.o: %.c $(HEADERS) @@ -14,8 +18,12 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) + clean: $(RM) $(OBJECTS) diff --git a/crypto_kem/kyber1024/aarch64/fips202x2.c b/crypto_kem/kyber1024/aarch64/fips202x2.c deleted file mode 100644 index 054c3140..00000000 --- a/crypto_kem/kyber1024/aarch64/fips202x2.c +++ /dev/null @@ -1,685 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We offer - * CC0 1.0 Universal or the following MIT License for this file. - * You may freely choose one of them that applies. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - out = vshlq_n_u64(a, offset); \ - out = vsriq_n_u64(out, a, 64 - offset); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - out = veorq_u64(a, b); \ - out = veorq_u64(out, c); \ - out = veorq_u64(out, d); \ - out = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - out = vbicq_u64(c, b); \ - out = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/crypto_kem/kyber1024/aarch64/symmetric.h b/crypto_kem/kyber1024/aarch64/symmetric.h index 1eb92e61..0a8b8f8a 100644 --- a/crypto_kem/kyber1024/aarch64/symmetric.h +++ b/crypto_kem/kyber1024/aarch64/symmetric.h @@ -12,7 +12,7 @@ #include #include "params.h" -#include "keccak2x/fips202.h" +#include "fips202.h" typedef shake128ctx xof_state; @@ -37,7 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) // NEON Definition -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" typedef keccakx2_state neon_xof_state; diff --git a/crypto_kem/kyber512/aarch64/Makefile b/crypto_kem/kyber512/aarch64/Makefile index 29330759..7ac8ffe1 100644 --- a/crypto_kem/kyber512/aarch64/Makefile +++ b/crypto_kem/kyber512/aarch64/Makefile @@ -6,6 +6,10 @@ OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o n CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o +KECCAK2X=$(KECCAK2XDIR)/$(KECCAK2XOBJ) + all: $(LIB) %.o: %.c $(HEADERS) @@ -14,8 +18,12 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) + clean: $(RM) $(OBJECTS) diff --git a/crypto_kem/kyber512/aarch64/fips202x2.c b/crypto_kem/kyber512/aarch64/fips202x2.c deleted file mode 100644 index 054c3140..00000000 --- a/crypto_kem/kyber512/aarch64/fips202x2.c +++ /dev/null @@ -1,685 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We offer - * CC0 1.0 Universal or the following MIT License for this file. - * You may freely choose one of them that applies. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - out = vshlq_n_u64(a, offset); \ - out = vsriq_n_u64(out, a, 64 - offset); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - out = veorq_u64(a, b); \ - out = veorq_u64(out, c); \ - out = veorq_u64(out, d); \ - out = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - out = vbicq_u64(c, b); \ - out = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/crypto_kem/kyber512/aarch64/symmetric.h b/crypto_kem/kyber512/aarch64/symmetric.h index 3a6e0310..9019fd99 100644 --- a/crypto_kem/kyber512/aarch64/symmetric.h +++ b/crypto_kem/kyber512/aarch64/symmetric.h @@ -12,7 +12,7 @@ #include #include "params.h" -#include "keccak2x/fips202.h" +#include "fips202.h" typedef shake128ctx xof_state; @@ -37,7 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) // NEON Definition -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" typedef keccakx2_state neon_xof_state; diff --git a/crypto_kem/kyber768/aarch64/Makefile b/crypto_kem/kyber768/aarch64/Makefile index 26a76ea5..bbde4361 100644 --- a/crypto_kem/kyber768/aarch64/Makefile +++ b/crypto_kem/kyber768/aarch64/Makefile @@ -6,6 +6,10 @@ OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o n CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o +KECCAK2X=$(KECCAK2XDIR)/$(KECCAK2XOBJ) + all: $(LIB) %.o: %.c $(HEADERS) @@ -14,8 +18,12 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) + clean: $(RM) $(OBJECTS) diff --git a/crypto_kem/kyber768/aarch64/symmetric.h b/crypto_kem/kyber768/aarch64/symmetric.h index bbd137b8..c88aac1a 100644 --- a/crypto_kem/kyber768/aarch64/symmetric.h +++ b/crypto_kem/kyber768/aarch64/symmetric.h @@ -12,7 +12,7 @@ #include #include "params.h" -#include "keccak2x/fips202.h" +#include "fips202.h" typedef shake128ctx xof_state; @@ -37,7 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) // NEON Definition -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" typedef keccakx2_state neon_xof_state; From 05df46936963e69f038375948c958e33d91e612d Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Thu, 25 Jan 2024 16:13:26 +0100 Subject: [PATCH 78/85] fix feat.S issues --- common/{ => keccak2x}/feat.S | 0 crypto_kem/kyber1024/aarch64/Makefile | 4 +- crypto_kem/kyber1024/aarch64/feat.S | 168 ----------------------- crypto_kem/kyber1024/aarch64/fips202x2.h | 59 -------- crypto_kem/kyber512/aarch64/Makefile | 4 +- crypto_kem/kyber512/aarch64/feat.S | 168 ----------------------- crypto_kem/kyber512/aarch64/fips202x2.h | 59 -------- crypto_kem/kyber768/aarch64/Makefile | 4 +- 8 files changed, 6 insertions(+), 460 deletions(-) rename common/{ => keccak2x}/feat.S (100%) delete mode 100644 crypto_kem/kyber1024/aarch64/feat.S delete mode 100644 crypto_kem/kyber1024/aarch64/fips202x2.h delete mode 100644 crypto_kem/kyber512/aarch64/feat.S delete mode 100644 crypto_kem/kyber512/aarch64/fips202x2.h diff --git a/common/feat.S b/common/keccak2x/feat.S similarity index 100% rename from common/feat.S rename to common/keccak2x/feat.S diff --git a/crypto_kem/kyber1024/aarch64/Makefile b/crypto_kem/kyber1024/aarch64/Makefile index 5642227d..2482270e 100644 --- a/crypto_kem/kyber1024/aarch64/Makefile +++ b/crypto_kem/kyber1024/aarch64/Makefile @@ -7,8 +7,8 @@ OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o n CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) KECCAK2XDIR=../../../common/keccak2x -KECCAK2XOBJ=fips202x2.o -KECCAK2X=$(KECCAK2XDIR)/$(KECCAK2XOBJ) +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) all: $(LIB) diff --git a/crypto_kem/kyber1024/aarch64/feat.S b/crypto_kem/kyber1024/aarch64/feat.S deleted file mode 100644 index 6c8e60be..00000000 --- a/crypto_kem/kyber1024/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global f1600x2 -.global _f1600x2 -f1600x2: -_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/crypto_kem/kyber1024/aarch64/fips202x2.h b/crypto_kem/kyber1024/aarch64/fips202x2.h deleted file mode 100644 index 3066c52b..00000000 --- a/crypto_kem/kyber1024/aarch64/fips202x2.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#endif diff --git a/crypto_kem/kyber512/aarch64/Makefile b/crypto_kem/kyber512/aarch64/Makefile index 7ac8ffe1..40cc55b1 100644 --- a/crypto_kem/kyber512/aarch64/Makefile +++ b/crypto_kem/kyber512/aarch64/Makefile @@ -7,8 +7,8 @@ OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o n CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) KECCAK2XDIR=../../../common/keccak2x -KECCAK2XOBJ=fips202x2.o -KECCAK2X=$(KECCAK2XDIR)/$(KECCAK2XOBJ) +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) all: $(LIB) diff --git a/crypto_kem/kyber512/aarch64/feat.S b/crypto_kem/kyber512/aarch64/feat.S deleted file mode 100644 index 6c8e60be..00000000 --- a/crypto_kem/kyber512/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global f1600x2 -.global _f1600x2 -f1600x2: -_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/crypto_kem/kyber512/aarch64/fips202x2.h b/crypto_kem/kyber512/aarch64/fips202x2.h deleted file mode 100644 index 3066c52b..00000000 --- a/crypto_kem/kyber512/aarch64/fips202x2.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#endif diff --git a/crypto_kem/kyber768/aarch64/Makefile b/crypto_kem/kyber768/aarch64/Makefile index bbde4361..b0876efa 100644 --- a/crypto_kem/kyber768/aarch64/Makefile +++ b/crypto_kem/kyber768/aarch64/Makefile @@ -7,8 +7,8 @@ OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o n CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) KECCAK2XDIR=../../../common/keccak2x -KECCAK2XOBJ=fips202x2.o -KECCAK2X=$(KECCAK2XDIR)/$(KECCAK2XOBJ) +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) all: $(LIB) From 7bbdebe0ac9f77a118b10030effe9781b11b554f Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Thu, 25 Jan 2024 16:40:16 +0100 Subject: [PATCH 79/85] rm fips202x2 in Dilithium --- crypto_kem/kyber1024/aarch64/Makefile | 1 - crypto_kem/kyber768/aarch64/Makefile | 1 - crypto_sign/dilithium2/aarch64/Makefile | 14 +- crypto_sign/dilithium2/aarch64/feat.S | 168 ----- crypto_sign/dilithium2/aarch64/fips202x2.c | 686 --------------------- crypto_sign/dilithium2/aarch64/fips202x2.h | 59 -- crypto_sign/dilithium2/aarch64/ntt.c | 4 +- crypto_sign/dilithium2/aarch64/ntt.h | 1 - crypto_sign/dilithium2/aarch64/params.h | 3 - crypto_sign/dilithium2/aarch64/poly.c | 14 +- crypto_sign/dilithium2/aarch64/polyvec.h | 3 - crypto_sign/dilithium2/aarch64/rounding.c | 3 - crypto_sign/dilithium2/aarch64/sign.c | 3 +- crypto_sign/dilithium2/aarch64/sign.h | 1 - crypto_sign/dilithium2/aarch64/symmetric.h | 3 +- crypto_sign/dilithium3/aarch64/Makefile | 11 +- crypto_sign/dilithium3/aarch64/feat.S | 168 ----- crypto_sign/dilithium3/aarch64/fips202x2.c | 686 --------------------- crypto_sign/dilithium3/aarch64/fips202x2.h | 59 -- crypto_sign/dilithium3/aarch64/ntt.c | 4 +- crypto_sign/dilithium3/aarch64/ntt.h | 1 - crypto_sign/dilithium3/aarch64/params.h | 3 - crypto_sign/dilithium3/aarch64/poly.c | 16 +- crypto_sign/dilithium3/aarch64/polyvec.h | 3 - crypto_sign/dilithium3/aarch64/rounding.c | 3 - crypto_sign/dilithium3/aarch64/sign.c | 3 +- crypto_sign/dilithium3/aarch64/sign.h | 1 - crypto_sign/dilithium3/aarch64/symmetric.h | 3 +- crypto_sign/dilithium5/aarch64/Makefile | 11 +- crypto_sign/dilithium5/aarch64/feat.S | 168 ----- crypto_sign/dilithium5/aarch64/fips202x2.c | 686 --------------------- crypto_sign/dilithium5/aarch64/fips202x2.h | 59 -- crypto_sign/dilithium5/aarch64/ntt.c | 4 +- crypto_sign/dilithium5/aarch64/ntt.h | 1 - crypto_sign/dilithium5/aarch64/params.h | 3 - crypto_sign/dilithium5/aarch64/poly.c | 16 +- crypto_sign/dilithium5/aarch64/polyvec.h | 3 - crypto_sign/dilithium5/aarch64/rounding.c | 3 - crypto_sign/dilithium5/aarch64/sign.c | 3 +- crypto_sign/dilithium5/aarch64/sign.h | 1 - crypto_sign/dilithium5/aarch64/symmetric.h | 3 +- 41 files changed, 46 insertions(+), 2840 deletions(-) delete mode 100644 crypto_sign/dilithium2/aarch64/feat.S delete mode 100644 crypto_sign/dilithium2/aarch64/fips202x2.c delete mode 100644 crypto_sign/dilithium2/aarch64/fips202x2.h delete mode 100644 crypto_sign/dilithium3/aarch64/feat.S delete mode 100644 crypto_sign/dilithium3/aarch64/fips202x2.c delete mode 100644 crypto_sign/dilithium3/aarch64/fips202x2.h delete mode 100644 crypto_sign/dilithium5/aarch64/feat.S delete mode 100644 crypto_sign/dilithium5/aarch64/fips202x2.c delete mode 100644 crypto_sign/dilithium5/aarch64/fips202x2.h diff --git a/crypto_kem/kyber1024/aarch64/Makefile b/crypto_kem/kyber1024/aarch64/Makefile index 2482270e..9062653c 100644 --- a/crypto_kem/kyber1024/aarch64/Makefile +++ b/crypto_kem/kyber1024/aarch64/Makefile @@ -24,7 +24,6 @@ $(LIB): $(OBJECTS) $(KECCAK2X) $(KECCAK2X): $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) - clean: $(RM) $(OBJECTS) $(RM) $(LIB) diff --git a/crypto_kem/kyber768/aarch64/Makefile b/crypto_kem/kyber768/aarch64/Makefile index b0876efa..aa78d6da 100644 --- a/crypto_kem/kyber768/aarch64/Makefile +++ b/crypto_kem/kyber768/aarch64/Makefile @@ -24,7 +24,6 @@ $(LIB): $(OBJECTS) $(KECCAK2X) $(KECCAK2X): $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) - clean: $(RM) $(OBJECTS) $(RM) $(LIB) diff --git a/crypto_sign/dilithium2/aarch64/Makefile b/crypto_sign/dilithium2/aarch64/Makefile index c443defa..6f5f550d 100644 --- a/crypto_sign/dilithium2/aarch64/Makefile +++ b/crypto_sign/dilithium2/aarch64/Makefile @@ -4,8 +4,11 @@ LIB=libdilithium2_aarch64.a HEADERS=api.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) all: $(LIB) @@ -15,9 +18,14 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) $(HEADERS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) clean: $(RM) $(OBJECTS) $(RM) $(LIB) + + diff --git a/crypto_sign/dilithium2/aarch64/feat.S b/crypto_sign/dilithium2/aarch64/feat.S deleted file mode 100644 index 6c8e60be..00000000 --- a/crypto_sign/dilithium2/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global f1600x2 -.global _f1600x2 -f1600x2: -_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/crypto_sign/dilithium2/aarch64/fips202x2.c b/crypto_sign/dilithium2/aarch64/fips202x2.c deleted file mode 100644 index c8ebcd36..00000000 --- a/crypto_sign/dilithium2/aarch64/fips202x2.c +++ /dev/null @@ -1,686 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We offer - * CC0 1.0 Universal or the following MIT License for this file. - * You may freely choose one of them that applies. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - out = vshlq_n_u64(a, offset); \ - out = vsriq_n_u64(out, a, 64 - offset); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - out = veorq_u64(a, b); \ - out = veorq_u64(out, c); \ - out = veorq_u64(out, d); \ - out = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - out = vbicq_u64(c, b); \ - out = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/crypto_sign/dilithium2/aarch64/fips202x2.h b/crypto_sign/dilithium2/aarch64/fips202x2.h deleted file mode 100644 index 3066c52b..00000000 --- a/crypto_sign/dilithium2/aarch64/fips202x2.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#endif diff --git a/crypto_sign/dilithium2/aarch64/ntt.c b/crypto_sign/dilithium2/aarch64/ntt.c index 92d92313..8f1a182e 100644 --- a/crypto_sign/dilithium2/aarch64/ntt.c +++ b/crypto_sign/dilithium2/aarch64/ntt.c @@ -48,11 +48,11 @@ const __attribute__ ((aligned (16)))int32_t constants[16] = { }; const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { -0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 + 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 }; const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { -0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 + 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 }; /************************************************* diff --git a/crypto_sign/dilithium2/aarch64/ntt.h b/crypto_sign/dilithium2/aarch64/ntt.h index 497330a7..dbfee936 100644 --- a/crypto_sign/dilithium2/aarch64/ntt.h +++ b/crypto_sign/dilithium2/aarch64/ntt.h @@ -68,5 +68,4 @@ void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); - #endif diff --git a/crypto_sign/dilithium2/aarch64/params.h b/crypto_sign/dilithium2/aarch64/params.h index 2eca982f..287ca9dd 100644 --- a/crypto_sign/dilithium2/aarch64/params.h +++ b/crypto_sign/dilithium2/aarch64/params.h @@ -25,7 +25,6 @@ #define D 13 #define ROOT_OF_UNITY 1753 - #define K 4 #define L 4 #define ETA 2 @@ -37,8 +36,6 @@ #define CRYPTO_ALGNAME "Dilithium2" #define CTILDEBYTES 32 - - #define POLYT1_PACKEDBYTES 320 #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) diff --git a/crypto_sign/dilithium2/aarch64/poly.c b/crypto_sign/dilithium2/aarch64/poly.c index 05dc1710..613a1309 100644 --- a/crypto_sign/dilithium2/aarch64/poly.c +++ b/crypto_sign/dilithium2/aarch64/poly.c @@ -39,7 +39,7 @@ #include "symmetric.h" #include -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "ntt.h" @@ -467,7 +467,6 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - if (t0 < 15) { t0 = t0 - (205 * t0 >> 10) * 5; a[ctr++] = 2 - t0; @@ -477,7 +476,6 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 2 - t1; } - } DBENCH_STOP(*tsample); @@ -642,7 +640,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); - for (i = 0; i < N / 8; ++i) { t[0] = ETA - a->coeffs[8 * i + 0]; t[1] = ETA - a->coeffs[8 * i + 1]; @@ -658,7 +655,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); } - DBENCH_STOP(*tpack); } @@ -674,7 +670,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 8; ++i) { r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; @@ -695,7 +690,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; } - DBENCH_STOP(*tpack); } @@ -868,7 +862,6 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); - for (i = 0; i < N / 4; ++i) { t[0] = GAMMA1 - a->coeffs[4 * i + 0]; t[1] = GAMMA1 - a->coeffs[4 * i + 1]; @@ -889,7 +882,6 @@ void polyz_pack(uint8_t *r, const poly *a) { r[9 * i + 8] = t[3] >> 10; } - DBENCH_STOP(*tpack); } @@ -906,7 +898,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 4; ++i) { r->coeffs[4 * i + 0] = a[9 * i + 0]; r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; @@ -934,7 +925,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; } - DBENCH_STOP(*tpack); } @@ -952,7 +942,6 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 4; ++i) { r[3 * i + 0] = a->coeffs[4 * i + 0]; r[3 * i + 0] |= a->coeffs[4 * i + 1] << 6; @@ -962,6 +951,5 @@ void polyw1_pack(uint8_t *r, const poly *a) { r[3 * i + 2] |= a->coeffs[4 * i + 3] << 2; } - DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium2/aarch64/polyvec.h b/crypto_sign/dilithium2/aarch64/polyvec.h index 8d8905bc..3e745866 100644 --- a/crypto_sign/dilithium2/aarch64/polyvec.h +++ b/crypto_sign/dilithium2/aarch64/polyvec.h @@ -42,12 +42,9 @@ void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v); - #define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm) int polyvecl_chknorm(const polyvecl *v, int32_t B); - - /* Vectors of polynomials of length K */ typedef struct { poly vec[K]; diff --git a/crypto_sign/dilithium2/aarch64/rounding.c b/crypto_sign/dilithium2/aarch64/rounding.c index 342e9683..1ee5075a 100644 --- a/crypto_sign/dilithium2/aarch64/rounding.c +++ b/crypto_sign/dilithium2/aarch64/rounding.c @@ -51,7 +51,6 @@ int32_t decompose(int32_t *a0, int32_t a) { a1 = (a1 * 11275 + (1 << 23)) >> 24; a1 ^= ((43 - a1) >> 31) & a1; - *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; return a1; @@ -94,12 +93,10 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } - if (a0 > 0) { return (a1 == 43) ? 0 : a1 + 1; } else { return (a1 == 0) ? 43 : a1 - 1; } - } diff --git a/crypto_sign/dilithium2/aarch64/sign.c b/crypto_sign/dilithium2/aarch64/sign.c index ad2fd833..ca92eb18 100644 --- a/crypto_sign/dilithium2/aarch64/sign.c +++ b/crypto_sign/dilithium2/aarch64/sign.c @@ -140,8 +140,7 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); - - for(n = 0; n < RNDBYTES; n++) { + for (n = 0; n < RNDBYTES; n++) { rnd[n] = 0; } shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); diff --git a/crypto_sign/dilithium2/aarch64/sign.h b/crypto_sign/dilithium2/aarch64/sign.h index 1a9dbea5..8b8a5283 100644 --- a/crypto_sign/dilithium2/aarch64/sign.h +++ b/crypto_sign/dilithium2/aarch64/sign.h @@ -13,7 +13,6 @@ #include #include - #define challenge DILITHIUM_NAMESPACE(challenge) void challenge(poly *c, const uint8_t seed[SEEDBYTES]); diff --git a/crypto_sign/dilithium2/aarch64/symmetric.h b/crypto_sign/dilithium2/aarch64/symmetric.h index 81b7f1a9..7d3aa1a7 100644 --- a/crypto_sign/dilithium2/aarch64/symmetric.h +++ b/crypto_sign/dilithium2/aarch64/symmetric.h @@ -34,7 +34,7 @@ */ #include "fips202.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "params.h" #include @@ -60,7 +60,6 @@ void dilithium_shake256x2_stream_init(keccakx2_state *state, const uint8_t seed[CRHBYTES], uint16_t nonce1, uint16_t nonce2); - #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE diff --git a/crypto_sign/dilithium3/aarch64/Makefile b/crypto_sign/dilithium3/aarch64/Makefile index ef1c358a..f1b97b4f 100644 --- a/crypto_sign/dilithium3/aarch64/Makefile +++ b/crypto_sign/dilithium3/aarch64/Makefile @@ -7,6 +7,10 @@ OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-s CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) + all: $(LIB) %.o: %.c $(HEADERS) @@ -15,8 +19,11 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) $(HEADERS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) clean: $(RM) $(OBJECTS) diff --git a/crypto_sign/dilithium3/aarch64/feat.S b/crypto_sign/dilithium3/aarch64/feat.S deleted file mode 100644 index 6c8e60be..00000000 --- a/crypto_sign/dilithium3/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global f1600x2 -.global _f1600x2 -f1600x2: -_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/crypto_sign/dilithium3/aarch64/fips202x2.c b/crypto_sign/dilithium3/aarch64/fips202x2.c deleted file mode 100644 index c8ebcd36..00000000 --- a/crypto_sign/dilithium3/aarch64/fips202x2.c +++ /dev/null @@ -1,686 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We offer - * CC0 1.0 Universal or the following MIT License for this file. - * You may freely choose one of them that applies. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - out = vshlq_n_u64(a, offset); \ - out = vsriq_n_u64(out, a, 64 - offset); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - out = veorq_u64(a, b); \ - out = veorq_u64(out, c); \ - out = veorq_u64(out, d); \ - out = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - out = vbicq_u64(c, b); \ - out = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/crypto_sign/dilithium3/aarch64/fips202x2.h b/crypto_sign/dilithium3/aarch64/fips202x2.h deleted file mode 100644 index 3066c52b..00000000 --- a/crypto_sign/dilithium3/aarch64/fips202x2.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#endif diff --git a/crypto_sign/dilithium3/aarch64/ntt.c b/crypto_sign/dilithium3/aarch64/ntt.c index 92d92313..8f1a182e 100644 --- a/crypto_sign/dilithium3/aarch64/ntt.c +++ b/crypto_sign/dilithium3/aarch64/ntt.c @@ -48,11 +48,11 @@ const __attribute__ ((aligned (16)))int32_t constants[16] = { }; const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { -0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 + 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 }; const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { -0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 + 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 }; /************************************************* diff --git a/crypto_sign/dilithium3/aarch64/ntt.h b/crypto_sign/dilithium3/aarch64/ntt.h index 25b2d95d..50894adb 100644 --- a/crypto_sign/dilithium3/aarch64/ntt.h +++ b/crypto_sign/dilithium3/aarch64/ntt.h @@ -68,5 +68,4 @@ void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); - #endif diff --git a/crypto_sign/dilithium3/aarch64/params.h b/crypto_sign/dilithium3/aarch64/params.h index 52803392..b5fe91d2 100644 --- a/crypto_sign/dilithium3/aarch64/params.h +++ b/crypto_sign/dilithium3/aarch64/params.h @@ -25,7 +25,6 @@ #define D 13 #define ROOT_OF_UNITY 1753 - #define K 6 #define L 5 #define ETA 4 @@ -37,8 +36,6 @@ #define CRYPTO_ALGNAME "Dilithium3" #define CTILDEBYTES 48 - - #define POLYT1_PACKEDBYTES 320 #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) diff --git a/crypto_sign/dilithium3/aarch64/poly.c b/crypto_sign/dilithium3/aarch64/poly.c index 5399da4b..ce46fab0 100644 --- a/crypto_sign/dilithium3/aarch64/poly.c +++ b/crypto_sign/dilithium3/aarch64/poly.c @@ -37,11 +37,11 @@ #include "reduce.h" #include "rounding.h" #include "symmetric.h" +#include "keccak2x/fips202x2.h" +#include "ntt.h" #include -#include "fips202x2.h" -#include "ntt.h" #define DBENCH_START() #define DBENCH_STOP(t) @@ -467,7 +467,6 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - if (t0 < 9) { a[ctr++] = 4 - t0; } @@ -475,7 +474,6 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 4 - t1; } - } DBENCH_STOP(*tsample); @@ -640,14 +638,12 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); - for (i = 0; i < N / 2; ++i) { t[0] = ETA - a->coeffs[2 * i + 0]; t[1] = ETA - a->coeffs[2 * i + 1]; r[i] = t[0] | (t[1] << 4); } - DBENCH_STOP(*tpack); } @@ -663,7 +659,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[i] & 0x0F; r->coeffs[2 * i + 1] = a[i] >> 4; @@ -671,7 +666,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; } - DBENCH_STOP(*tpack); } @@ -844,7 +838,6 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); - for (i = 0; i < N / 2; ++i) { t[0] = GAMMA1 - a->coeffs[2 * i + 0]; t[1] = GAMMA1 - a->coeffs[2 * i + 1]; @@ -857,7 +850,6 @@ void polyz_pack(uint8_t *r, const poly *a) { r[5 * i + 4] = t[1] >> 12; } - DBENCH_STOP(*tpack); } @@ -874,7 +866,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; @@ -890,7 +881,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; } - DBENCH_STOP(*tpack); } @@ -908,11 +898,9 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 2; ++i) { r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); } - DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium3/aarch64/polyvec.h b/crypto_sign/dilithium3/aarch64/polyvec.h index fe78217d..ad4e36ab 100644 --- a/crypto_sign/dilithium3/aarch64/polyvec.h +++ b/crypto_sign/dilithium3/aarch64/polyvec.h @@ -42,12 +42,9 @@ void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v); - #define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm) int polyvecl_chknorm(const polyvecl *v, int32_t B); - - /* Vectors of polynomials of length K */ typedef struct { poly vec[K]; diff --git a/crypto_sign/dilithium3/aarch64/rounding.c b/crypto_sign/dilithium3/aarch64/rounding.c index b0068bd4..c0143277 100644 --- a/crypto_sign/dilithium3/aarch64/rounding.c +++ b/crypto_sign/dilithium3/aarch64/rounding.c @@ -51,7 +51,6 @@ int32_t decompose(int32_t *a0, int32_t a) { a1 = (a1 * 1025 + (1 << 21)) >> 22; a1 &= 15; - *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; return a1; @@ -94,12 +93,10 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } - if (a0 > 0) { return (a1 + 1) & 15; } else { return (a1 - 1) & 15; } - } diff --git a/crypto_sign/dilithium3/aarch64/sign.c b/crypto_sign/dilithium3/aarch64/sign.c index 5eb6dee8..86c958b4 100644 --- a/crypto_sign/dilithium3/aarch64/sign.c +++ b/crypto_sign/dilithium3/aarch64/sign.c @@ -140,8 +140,7 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); - - for(n = 0; n < RNDBYTES; n++) { + for (n = 0; n < RNDBYTES; n++) { rnd[n] = 0; } shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); diff --git a/crypto_sign/dilithium3/aarch64/sign.h b/crypto_sign/dilithium3/aarch64/sign.h index 97c60f3d..0759909c 100644 --- a/crypto_sign/dilithium3/aarch64/sign.h +++ b/crypto_sign/dilithium3/aarch64/sign.h @@ -13,7 +13,6 @@ #include #include - #define challenge DILITHIUM_NAMESPACE(challenge) void challenge(poly *c, const uint8_t seed[SEEDBYTES]); diff --git a/crypto_sign/dilithium3/aarch64/symmetric.h b/crypto_sign/dilithium3/aarch64/symmetric.h index cf7ff128..d9551aba 100644 --- a/crypto_sign/dilithium3/aarch64/symmetric.h +++ b/crypto_sign/dilithium3/aarch64/symmetric.h @@ -34,7 +34,7 @@ */ #include "fips202.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "params.h" #include @@ -60,7 +60,6 @@ void dilithium_shake256x2_stream_init(keccakx2_state *state, const uint8_t seed[CRHBYTES], uint16_t nonce1, uint16_t nonce2); - #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE diff --git a/crypto_sign/dilithium5/aarch64/Makefile b/crypto_sign/dilithium5/aarch64/Makefile index 006d82b2..a52fece1 100644 --- a/crypto_sign/dilithium5/aarch64/Makefile +++ b/crypto_sign/dilithium5/aarch64/Makefile @@ -7,6 +7,10 @@ OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-s CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) + all: $(LIB) %.o: %.c $(HEADERS) @@ -15,8 +19,11 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) $(HEADERS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) clean: $(RM) $(OBJECTS) diff --git a/crypto_sign/dilithium5/aarch64/feat.S b/crypto_sign/dilithium5/aarch64/feat.S deleted file mode 100644 index 6c8e60be..00000000 --- a/crypto_sign/dilithium5/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global f1600x2 -.global _f1600x2 -f1600x2: -_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/crypto_sign/dilithium5/aarch64/fips202x2.c b/crypto_sign/dilithium5/aarch64/fips202x2.c deleted file mode 100644 index c8ebcd36..00000000 --- a/crypto_sign/dilithium5/aarch64/fips202x2.c +++ /dev/null @@ -1,686 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We offer - * CC0 1.0 Universal or the following MIT License for this file. - * You may freely choose one of them that applies. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - out = vshlq_n_u64(a, offset); \ - out = vsriq_n_u64(out, a, 64 - offset); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - out = veorq_u64(a, b); \ - out = veorq_u64(out, c); \ - out = veorq_u64(out, d); \ - out = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - out = vbicq_u64(c, b); \ - out = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/crypto_sign/dilithium5/aarch64/fips202x2.h b/crypto_sign/dilithium5/aarch64/fips202x2.h deleted file mode 100644 index 3066c52b..00000000 --- a/crypto_sign/dilithium5/aarch64/fips202x2.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#endif diff --git a/crypto_sign/dilithium5/aarch64/ntt.c b/crypto_sign/dilithium5/aarch64/ntt.c index 92d92313..8f1a182e 100644 --- a/crypto_sign/dilithium5/aarch64/ntt.c +++ b/crypto_sign/dilithium5/aarch64/ntt.c @@ -48,11 +48,11 @@ const __attribute__ ((aligned (16)))int32_t constants[16] = { }; const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { -0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 + 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 }; const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { -0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 + 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 }; /************************************************* diff --git a/crypto_sign/dilithium5/aarch64/ntt.h b/crypto_sign/dilithium5/aarch64/ntt.h index d3c7b756..204626e7 100644 --- a/crypto_sign/dilithium5/aarch64/ntt.h +++ b/crypto_sign/dilithium5/aarch64/ntt.h @@ -68,5 +68,4 @@ void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); - #endif diff --git a/crypto_sign/dilithium5/aarch64/params.h b/crypto_sign/dilithium5/aarch64/params.h index e86fc797..71681fd8 100644 --- a/crypto_sign/dilithium5/aarch64/params.h +++ b/crypto_sign/dilithium5/aarch64/params.h @@ -25,7 +25,6 @@ #define D 13 #define ROOT_OF_UNITY 1753 - #define K 8 #define L 7 #define ETA 2 @@ -37,8 +36,6 @@ #define CRYPTO_ALGNAME "Dilithium5" #define CTILDEBYTES 64 - - #define POLYT1_PACKEDBYTES 320 #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) diff --git a/crypto_sign/dilithium5/aarch64/poly.c b/crypto_sign/dilithium5/aarch64/poly.c index ad3b1109..749bc568 100644 --- a/crypto_sign/dilithium5/aarch64/poly.c +++ b/crypto_sign/dilithium5/aarch64/poly.c @@ -37,11 +37,11 @@ #include "reduce.h" #include "rounding.h" #include "symmetric.h" +#include "keccak2x/fips202x2.h" +#include "ntt.h" #include -#include "fips202x2.h" -#include "ntt.h" #define DBENCH_START() #define DBENCH_STOP(t) @@ -467,7 +467,6 @@ static unsigned int rej_eta(int32_t *a, t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - if (t0 < 15) { t0 = t0 - (205 * t0 >> 10) * 5; a[ctr++] = 2 - t0; @@ -477,7 +476,6 @@ static unsigned int rej_eta(int32_t *a, a[ctr++] = 2 - t1; } - } DBENCH_STOP(*tsample); @@ -642,7 +640,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { uint8_t t[8]; DBENCH_START(); - for (i = 0; i < N / 8; ++i) { t[0] = ETA - a->coeffs[8 * i + 0]; t[1] = ETA - a->coeffs[8 * i + 1]; @@ -658,7 +655,6 @@ void polyeta_pack(uint8_t *r, const poly *a) { r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); } - DBENCH_STOP(*tpack); } @@ -674,7 +670,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 8; ++i) { r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; @@ -695,7 +690,6 @@ void polyeta_unpack(poly *r, const uint8_t *a) { r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; } - DBENCH_STOP(*tpack); } @@ -868,7 +862,6 @@ void polyz_pack(uint8_t *r, const poly *a) { uint32_t t[4]; DBENCH_START(); - for (i = 0; i < N / 2; ++i) { t[0] = GAMMA1 - a->coeffs[2 * i + 0]; t[1] = GAMMA1 - a->coeffs[2 * i + 1]; @@ -881,7 +874,6 @@ void polyz_pack(uint8_t *r, const poly *a) { r[5 * i + 4] = t[1] >> 12; } - DBENCH_STOP(*tpack); } @@ -898,7 +890,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; @@ -914,7 +905,6 @@ void polyz_unpack(poly *r, const uint8_t *a) { r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; } - DBENCH_STOP(*tpack); } @@ -932,11 +922,9 @@ void polyw1_pack(uint8_t *r, const poly *a) { unsigned int i; DBENCH_START(); - for (i = 0; i < N / 2; ++i) { r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); } - DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium5/aarch64/polyvec.h b/crypto_sign/dilithium5/aarch64/polyvec.h index a3130785..d67f7590 100644 --- a/crypto_sign/dilithium5/aarch64/polyvec.h +++ b/crypto_sign/dilithium5/aarch64/polyvec.h @@ -42,12 +42,9 @@ void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v); - #define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm) int polyvecl_chknorm(const polyvecl *v, int32_t B); - - /* Vectors of polynomials of length K */ typedef struct { poly vec[K]; diff --git a/crypto_sign/dilithium5/aarch64/rounding.c b/crypto_sign/dilithium5/aarch64/rounding.c index b0068bd4..c0143277 100644 --- a/crypto_sign/dilithium5/aarch64/rounding.c +++ b/crypto_sign/dilithium5/aarch64/rounding.c @@ -51,7 +51,6 @@ int32_t decompose(int32_t *a0, int32_t a) { a1 = (a1 * 1025 + (1 << 21)) >> 22; a1 &= 15; - *a0 = a - a1 * 2 * GAMMA2; *a0 -= (((DILITHIUM_Q - 1) / 2 - *a0) >> 31) & DILITHIUM_Q; return a1; @@ -94,12 +93,10 @@ int32_t use_hint(int32_t a, unsigned int hint) { return a1; } - if (a0 > 0) { return (a1 + 1) & 15; } else { return (a1 - 1) & 15; } - } diff --git a/crypto_sign/dilithium5/aarch64/sign.c b/crypto_sign/dilithium5/aarch64/sign.c index 4b0be0f5..156b994f 100644 --- a/crypto_sign/dilithium5/aarch64/sign.c +++ b/crypto_sign/dilithium5/aarch64/sign.c @@ -140,8 +140,7 @@ int crypto_sign_signature(uint8_t *sig, shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); - - for(n = 0; n < RNDBYTES; n++) { + for (n = 0; n < RNDBYTES; n++) { rnd[n] = 0; } shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); diff --git a/crypto_sign/dilithium5/aarch64/sign.h b/crypto_sign/dilithium5/aarch64/sign.h index 692f06d3..fbccb2c7 100644 --- a/crypto_sign/dilithium5/aarch64/sign.h +++ b/crypto_sign/dilithium5/aarch64/sign.h @@ -13,7 +13,6 @@ #include #include - #define challenge DILITHIUM_NAMESPACE(challenge) void challenge(poly *c, const uint8_t seed[SEEDBYTES]); diff --git a/crypto_sign/dilithium5/aarch64/symmetric.h b/crypto_sign/dilithium5/aarch64/symmetric.h index ecf767f7..1a2a89d8 100644 --- a/crypto_sign/dilithium5/aarch64/symmetric.h +++ b/crypto_sign/dilithium5/aarch64/symmetric.h @@ -34,7 +34,7 @@ */ #include "fips202.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "params.h" #include @@ -60,7 +60,6 @@ void dilithium_shake256x2_stream_init(keccakx2_state *state, const uint8_t seed[CRHBYTES], uint16_t nonce1, uint16_t nonce2); - #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE From 941f4e19725ca0d27c1a61121665df0d48cd7b39 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Thu, 25 Jan 2024 16:44:15 +0100 Subject: [PATCH 80/85] style --- crypto_sign/dilithium2/aarch64/ntt.c | 5 ++--- crypto_sign/dilithium2/aarch64/ntt.h | 2 +- crypto_sign/dilithium2/aarch64/poly.c | 4 +--- crypto_sign/dilithium2/aarch64/polyvec.c | 3 +-- crypto_sign/dilithium3/aarch64/ntt.c | 5 ++--- crypto_sign/dilithium3/aarch64/ntt.h | 2 +- crypto_sign/dilithium3/aarch64/poly.c | 2 -- crypto_sign/dilithium3/aarch64/polyvec.c | 3 +-- crypto_sign/dilithium5/aarch64/ntt.c | 5 ++--- crypto_sign/dilithium5/aarch64/ntt.h | 2 +- crypto_sign/dilithium5/aarch64/poly.c | 2 -- crypto_sign/dilithium5/aarch64/polyvec.c | 3 +-- 12 files changed, 13 insertions(+), 25 deletions(-) diff --git a/crypto_sign/dilithium2/aarch64/ntt.c b/crypto_sign/dilithium2/aarch64/ntt.c index 8f1a182e..ec594a77 100644 --- a/crypto_sign/dilithium2/aarch64/ntt.c +++ b/crypto_sign/dilithium2/aarch64/ntt.c @@ -33,11 +33,10 @@ */ #include "params.h" -#include -#include - #include "NTT_params.h" #include "ntt.h" +#include +#include const __attribute__ ((aligned (16)))int32_t constants[16] = { Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, diff --git a/crypto_sign/dilithium2/aarch64/ntt.h b/crypto_sign/dilithium2/aarch64/ntt.h index dbfee936..060921bc 100644 --- a/crypto_sign/dilithium2/aarch64/ntt.h +++ b/crypto_sign/dilithium2/aarch64/ntt.h @@ -34,9 +34,9 @@ * SOFTWARE. */ -#include #include "params.h" #include "NTT_params.h" +#include extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); diff --git a/crypto_sign/dilithium2/aarch64/poly.c b/crypto_sign/dilithium2/aarch64/poly.c index 613a1309..103f294e 100644 --- a/crypto_sign/dilithium2/aarch64/poly.c +++ b/crypto_sign/dilithium2/aarch64/poly.c @@ -37,11 +37,9 @@ #include "reduce.h" #include "rounding.h" #include "symmetric.h" -#include - #include "keccak2x/fips202x2.h" - #include "ntt.h" +#include #define DBENCH_START() #define DBENCH_STOP(t) diff --git a/crypto_sign/dilithium2/aarch64/polyvec.c b/crypto_sign/dilithium2/aarch64/polyvec.c index b03dd1fa..019c0638 100644 --- a/crypto_sign/dilithium2/aarch64/polyvec.c +++ b/crypto_sign/dilithium2/aarch64/polyvec.c @@ -32,13 +32,12 @@ * SOFTWARE. */ -#include - #include "params.h" #include "poly.h" #include "polyvec.h" #include "ntt.h" #include "reduce.h" +#include /************************************************* * Name: expand_mat diff --git a/crypto_sign/dilithium3/aarch64/ntt.c b/crypto_sign/dilithium3/aarch64/ntt.c index 8f1a182e..ec594a77 100644 --- a/crypto_sign/dilithium3/aarch64/ntt.c +++ b/crypto_sign/dilithium3/aarch64/ntt.c @@ -33,11 +33,10 @@ */ #include "params.h" -#include -#include - #include "NTT_params.h" #include "ntt.h" +#include +#include const __attribute__ ((aligned (16)))int32_t constants[16] = { Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, diff --git a/crypto_sign/dilithium3/aarch64/ntt.h b/crypto_sign/dilithium3/aarch64/ntt.h index 50894adb..3917786e 100644 --- a/crypto_sign/dilithium3/aarch64/ntt.h +++ b/crypto_sign/dilithium3/aarch64/ntt.h @@ -34,9 +34,9 @@ * SOFTWARE. */ -#include #include "params.h" #include "NTT_params.h" +#include extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); diff --git a/crypto_sign/dilithium3/aarch64/poly.c b/crypto_sign/dilithium3/aarch64/poly.c index ce46fab0..554c5062 100644 --- a/crypto_sign/dilithium3/aarch64/poly.c +++ b/crypto_sign/dilithium3/aarch64/poly.c @@ -41,8 +41,6 @@ #include "ntt.h" #include - - #define DBENCH_START() #define DBENCH_STOP(t) diff --git a/crypto_sign/dilithium3/aarch64/polyvec.c b/crypto_sign/dilithium3/aarch64/polyvec.c index 6cd84c83..1a5d6aa9 100644 --- a/crypto_sign/dilithium3/aarch64/polyvec.c +++ b/crypto_sign/dilithium3/aarch64/polyvec.c @@ -32,13 +32,12 @@ * SOFTWARE. */ -#include - #include "params.h" #include "poly.h" #include "polyvec.h" #include "ntt.h" #include "reduce.h" +#include /************************************************* * Name: expand_mat diff --git a/crypto_sign/dilithium5/aarch64/ntt.c b/crypto_sign/dilithium5/aarch64/ntt.c index 8f1a182e..ec594a77 100644 --- a/crypto_sign/dilithium5/aarch64/ntt.c +++ b/crypto_sign/dilithium5/aarch64/ntt.c @@ -33,11 +33,10 @@ */ #include "params.h" -#include -#include - #include "NTT_params.h" #include "ntt.h" +#include +#include const __attribute__ ((aligned (16)))int32_t constants[16] = { Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, diff --git a/crypto_sign/dilithium5/aarch64/ntt.h b/crypto_sign/dilithium5/aarch64/ntt.h index 204626e7..9d00a098 100644 --- a/crypto_sign/dilithium5/aarch64/ntt.h +++ b/crypto_sign/dilithium5/aarch64/ntt.h @@ -34,9 +34,9 @@ * SOFTWARE. */ -#include #include "params.h" #include "NTT_params.h" +#include extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); diff --git a/crypto_sign/dilithium5/aarch64/poly.c b/crypto_sign/dilithium5/aarch64/poly.c index 749bc568..0d746a4c 100644 --- a/crypto_sign/dilithium5/aarch64/poly.c +++ b/crypto_sign/dilithium5/aarch64/poly.c @@ -41,8 +41,6 @@ #include "ntt.h" #include - - #define DBENCH_START() #define DBENCH_STOP(t) diff --git a/crypto_sign/dilithium5/aarch64/polyvec.c b/crypto_sign/dilithium5/aarch64/polyvec.c index 9218650c..ce4828e9 100644 --- a/crypto_sign/dilithium5/aarch64/polyvec.c +++ b/crypto_sign/dilithium5/aarch64/polyvec.c @@ -32,13 +32,12 @@ * SOFTWARE. */ -#include - #include "params.h" #include "poly.h" #include "polyvec.h" #include "ntt.h" #include "reduce.h" +#include /************************************************* * Name: expand_mat From 806afd14faa4ecefc525bd3d6a9c8946100778c9 Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Thu, 25 Jan 2024 17:42:28 +0100 Subject: [PATCH 81/85] fix consistency tests --- test/duplicate_consistency/dilithium2_aarch64.yml | 4 ---- test/duplicate_consistency/dilithium3_aarch64.yml | 4 ---- test/duplicate_consistency/dilithium5_aarch64.yml | 4 ---- test/duplicate_consistency/kyber1024_aarch64.yml | 4 ---- test/duplicate_consistency/kyber512_aarch64.yml | 4 ---- test/duplicate_consistency/kyber768_aarch64.yml | 4 ---- 6 files changed, 24 deletions(-) diff --git a/test/duplicate_consistency/dilithium2_aarch64.yml b/test/duplicate_consistency/dilithium2_aarch64.yml index 2d16a72a..30907666 100644 --- a/test/duplicate_consistency/dilithium2_aarch64.yml +++ b/test/duplicate_consistency/dilithium2_aarch64.yml @@ -6,8 +6,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h @@ -32,8 +30,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h diff --git a/test/duplicate_consistency/dilithium3_aarch64.yml b/test/duplicate_consistency/dilithium3_aarch64.yml index 09afb1ca..4723c4ee 100644 --- a/test/duplicate_consistency/dilithium3_aarch64.yml +++ b/test/duplicate_consistency/dilithium3_aarch64.yml @@ -6,8 +6,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h @@ -32,8 +30,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h diff --git a/test/duplicate_consistency/dilithium5_aarch64.yml b/test/duplicate_consistency/dilithium5_aarch64.yml index 486a4eba..29941f2e 100644 --- a/test/duplicate_consistency/dilithium5_aarch64.yml +++ b/test/duplicate_consistency/dilithium5_aarch64.yml @@ -6,8 +6,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h @@ -32,8 +30,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h diff --git a/test/duplicate_consistency/kyber1024_aarch64.yml b/test/duplicate_consistency/kyber1024_aarch64.yml index f356ea57..09ffc25e 100644 --- a/test/duplicate_consistency/kyber1024_aarch64.yml +++ b/test/duplicate_consistency/kyber1024_aarch64.yml @@ -7,8 +7,6 @@ consistency_checks: - __asm_NTT.S - __asm_poly.S - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc @@ -39,8 +37,6 @@ consistency_checks: - __asm_poly.S - cbd.c - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc diff --git a/test/duplicate_consistency/kyber512_aarch64.yml b/test/duplicate_consistency/kyber512_aarch64.yml index 1972d144..731a4728 100644 --- a/test/duplicate_consistency/kyber512_aarch64.yml +++ b/test/duplicate_consistency/kyber512_aarch64.yml @@ -7,8 +7,6 @@ consistency_checks: - __asm_NTT.S - __asm_poly.S - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc @@ -40,8 +38,6 @@ consistency_checks: - __asm_NTT.S - __asm_poly.S - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc diff --git a/test/duplicate_consistency/kyber768_aarch64.yml b/test/duplicate_consistency/kyber768_aarch64.yml index 4578fc68..fb830026 100644 --- a/test/duplicate_consistency/kyber768_aarch64.yml +++ b/test/duplicate_consistency/kyber768_aarch64.yml @@ -8,8 +8,6 @@ consistency_checks: - __asm_NTT.S - __asm_poly.S - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc @@ -42,8 +40,6 @@ consistency_checks: - __asm_poly.S - cbd.c - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc From 6c771bc4b24eea0f452649e84ed09ebfba12e0ea Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Thu, 25 Jan 2024 18:42:11 +0100 Subject: [PATCH 82/85] recover fix for the conversion warning --- crypto_sign/dilithium2/clean/sign.c | 2 +- crypto_sign/dilithium3/clean/sign.c | 2 +- crypto_sign/dilithium5/clean/sign.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto_sign/dilithium2/clean/sign.c b/crypto_sign/dilithium2/clean/sign.c index 93a137dc..8d04fefe 100644 --- a/crypto_sign/dilithium2/clean/sign.c +++ b/crypto_sign/dilithium2/clean/sign.c @@ -337,7 +337,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, badsig: /* Signature verification failed */ - *mlen = -1; + *mlen = (size_t) -1; for (i = 0; i < smlen; ++i) { m[i] = 0; } diff --git a/crypto_sign/dilithium3/clean/sign.c b/crypto_sign/dilithium3/clean/sign.c index 7a4f3d31..0f13be41 100644 --- a/crypto_sign/dilithium3/clean/sign.c +++ b/crypto_sign/dilithium3/clean/sign.c @@ -337,7 +337,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, badsig: /* Signature verification failed */ - *mlen = -1; + *mlen = (size_t) -1; for (i = 0; i < smlen; ++i) { m[i] = 0; } diff --git a/crypto_sign/dilithium5/clean/sign.c b/crypto_sign/dilithium5/clean/sign.c index 2524c470..d7a85ebf 100644 --- a/crypto_sign/dilithium5/clean/sign.c +++ b/crypto_sign/dilithium5/clean/sign.c @@ -337,7 +337,7 @@ int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_open(uint8_t *m, badsig: /* Signature verification failed */ - *mlen = -1; + *mlen = (size_t) -1; for (i = 0; i < smlen; ++i) { m[i] = 0; } From 553160de71a97e3e666a4e00eccbfc485df6eefd Mon Sep 17 00:00:00 2001 From: Thom Wiggers Date: Mon, 5 Feb 2024 08:19:58 +0100 Subject: [PATCH 83/85] Allow shake2x symbols to be not-namespaced --- test/test_symbol_namespace.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_symbol_namespace.py b/test/test_symbol_namespace.py index cf52833d..7bfa889e 100644 --- a/test/test_symbol_namespace.py +++ b/test/test_symbol_namespace.py @@ -48,6 +48,10 @@ def test_symbol_namespaces(implementation, impl_path, test_dir, init, destr): symbol.startswith('_KeccakF1600times4') or # MacOS symbol.startswith('KeccakP1600times4') or symbol.startswith('_KeccakP1600times4') or # MacOS + # shake2x + symbol.lstrip('_').startswith('f1600x2') or + symbol.lstrip('_').startswith('shake128x2') or + symbol.lstrip('_').startswith('shake256x2') or # weird things on i386 symbol.startswith('__x86.get_pc_thunk.')): non_namespaced.append(symbol) From ee71d2c823982bfcf54686f3cf1d666f396dc9aa Mon Sep 17 00:00:00 2001 From: vincentvbh Date: Mon, 5 Feb 2024 13:21:45 +0100 Subject: [PATCH 84/85] namespace tables of constants --- crypto_kem/kyber1024/aarch64/ntt.h | 6 +++++- crypto_kem/kyber512/aarch64/ntt.h | 6 +++++- crypto_kem/kyber768/aarch64/ntt.h | 6 +++++- crypto_sign/dilithium2/aarch64/ntt.h | 4 ++++ crypto_sign/dilithium3/aarch64/ntt.h | 4 ++++ crypto_sign/dilithium5/aarch64/ntt.h | 4 ++++ 6 files changed, 27 insertions(+), 3 deletions(-) diff --git a/crypto_kem/kyber1024/aarch64/ntt.h b/crypto_kem/kyber1024/aarch64/ntt.h index cd74b9a8..5eaa0bdb 100644 --- a/crypto_kem/kyber1024/aarch64/ntt.h +++ b/crypto_kem/kyber1024/aarch64/ntt.h @@ -35,7 +35,11 @@ #include "NTT_params.h" -extern const int16_t zetas[128]; +#define asymmetric_const KYBER_NAMESPACE(asymmetric_const) +#define constants KYBER_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define pre_asymmetric_table_Q1_extended KYBER_NAMESPACE(pre_asymmetric_table_Q1_extended) +#define streamlined_inv_GS_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_inv_GS_negacyclic_table_Q1_jump_extended) extern void PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); diff --git a/crypto_kem/kyber512/aarch64/ntt.h b/crypto_kem/kyber512/aarch64/ntt.h index 43307eb2..795ebf3c 100644 --- a/crypto_kem/kyber512/aarch64/ntt.h +++ b/crypto_kem/kyber512/aarch64/ntt.h @@ -35,7 +35,11 @@ #include "NTT_params.h" -extern const int16_t zetas[128]; +#define asymmetric_const KYBER_NAMESPACE(asymmetric_const) +#define constants KYBER_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define pre_asymmetric_table_Q1_extended KYBER_NAMESPACE(pre_asymmetric_table_Q1_extended) +#define streamlined_inv_GS_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_inv_GS_negacyclic_table_Q1_jump_extended) extern void PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); diff --git a/crypto_kem/kyber768/aarch64/ntt.h b/crypto_kem/kyber768/aarch64/ntt.h index 3cf77b53..59945023 100644 --- a/crypto_kem/kyber768/aarch64/ntt.h +++ b/crypto_kem/kyber768/aarch64/ntt.h @@ -35,7 +35,11 @@ #include "NTT_params.h" -extern const int16_t zetas[128]; +#define asymmetric_const KYBER_NAMESPACE(asymmetric_const) +#define constants KYBER_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define pre_asymmetric_table_Q1_extended KYBER_NAMESPACE(pre_asymmetric_table_Q1_extended) +#define streamlined_inv_GS_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_inv_GS_negacyclic_table_Q1_jump_extended) extern void PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); diff --git a/crypto_sign/dilithium2/aarch64/ntt.h b/crypto_sign/dilithium2/aarch64/ntt.h index 060921bc..5c85aa48 100644 --- a/crypto_sign/dilithium2/aarch64/ntt.h +++ b/crypto_sign/dilithium2/aarch64/ntt.h @@ -38,6 +38,10 @@ #include "NTT_params.h" #include +#define constants DILITHIUM_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define streamlined_GS_itable_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_GS_itable_Q1_jump_extended) + extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); diff --git a/crypto_sign/dilithium3/aarch64/ntt.h b/crypto_sign/dilithium3/aarch64/ntt.h index 3917786e..ea338901 100644 --- a/crypto_sign/dilithium3/aarch64/ntt.h +++ b/crypto_sign/dilithium3/aarch64/ntt.h @@ -38,6 +38,10 @@ #include "NTT_params.h" #include +#define constants DILITHIUM_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define streamlined_GS_itable_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_GS_itable_Q1_jump_extended) + extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); diff --git a/crypto_sign/dilithium5/aarch64/ntt.h b/crypto_sign/dilithium5/aarch64/ntt.h index 9d00a098..8989c025 100644 --- a/crypto_sign/dilithium5/aarch64/ntt.h +++ b/crypto_sign/dilithium5/aarch64/ntt.h @@ -38,6 +38,10 @@ #include "NTT_params.h" #include +#define constants DILITHIUM_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define streamlined_GS_itable_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_GS_itable_Q1_jump_extended) + extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); From b895885d2b9f2d66952cbc50d1c70a0a16673fb8 Mon Sep 17 00:00:00 2001 From: James Edington Administrator Date: Wed, 7 Feb 2024 11:35:13 -0600 Subject: [PATCH 85/85] Bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index baad2edc..02913643 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pypqc" -version = "0.0.6.2-rc3" +version = "0.0.6.2-rc4" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"}