diff --git a/.github/workflows/bdist.yaml b/.github/workflows/bdist.yaml deleted file mode 100644 index 533c79893..000000000 --- a/.github/workflows/bdist.yaml +++ /dev/null @@ -1,36 +0,0 @@ -name: Build - -on: [push] - -jobs: - build_wheels: - name: bdist_wheel on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-20.04, windows-2019] - - steps: - - uses: actions/checkout@v4 - - # Used to host cibuildwheel - - uses: actions/setup-python@v3 - - - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.16.3 - - - name: Build wheels - run: python -m cibuildwheel --output-dir wheelhouse - env: - CIBW_BUILD_VERBOSITY: 1 - # Python 3.6 is EOL - # CFFI or maybe PQClean doesn't seem to work with musllinux - CIBW_SKIP: "cp36* *-musllinux_*" - # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E - CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" - CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" - - - uses: actions/upload-artifact@v4 - with: - name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} - path: ./wheelhouse/*.whl diff --git a/.github/workflows/python_build.yaml b/.github/workflows/python_build.yaml new file mode 100644 index 000000000..6d9af4a38 --- /dev/null +++ b/.github/workflows/python_build.yaml @@ -0,0 +1,99 @@ +name: Build + +on: [push] + +jobs: + sdist: + name: Source build for pip + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + - name: install dev requirements + run: python -m pip install -r requirements-dev.txt + - name: build sdist + run: python -m build --sdist && twine check ./dist/* + - name: upload sdist + uses: actions/upload-artifact@v4 + with: + name: sdist + path: ./dist/* + + bdist: + name: Binary build for ${{ matrix.py-impl }} on ${{ matrix.os }} + needs: sdist + runs-on: ${{ matrix.github_os }} + strategy: + fail-fast: false + matrix: + py-impl: ["CPython 3.X", "PyPy 3.X"] + os: [Windows, Mac, Linux, "Linux (Extra Architectures)"] + + include: + - py-impl: "CPython 3.X" + cibw_build: "cp3*" + - py-impl: "PyPy 3.X" + cibw_build: "pp3*" + # Use the oldest OSes available for compatibility + - os: Windows + github_os: windows-2019 + cibw_archs: AMD64 x86 ARM64 + - os: Mac + github_os: macos-11 + cibw_archs: x86_64 arm64 universal2 + - os: Linux + github_os: ubuntu-20.04 + cibw_archs: x86_64 i686 + - os: Linux (Extra Architectures) + github_os: ubuntu-20.04 + cibw_archs: aarch64 ppc64le s390x + + exclude: + - os: Linux (Extra Architectures) + # FIXME? cibuildwheel disagrees with this + py-impl: "PyPy 3.X" + + steps: + - uses: actions/download-artifact@v4 + with: + name: sdist + path: ./dist/ + - uses: tj-actions/glob@v20 + # FIXME? use a more programmatic or integrated solution here + id: sdist_glob + with: + files: ./dist/* + + # Used to host cibuildwheel + - uses: actions/setup-python@v3 + + - if: matrix.os == 'Linux (Extra Architectures)' + uses: docker/setup-qemu-action@v3 + + - name: Install cibuildwheel + run: python -m pip install cibuildwheel==2.16.3 + + - name: Build wheels + # https://github.com/pypa/cibuildwheel/issues/173#issuecomment-1501236916 + run: python -m cibuildwheel ${{ steps.sdist_glob.outputs.paths }} --output-dir ./dist + env: + CIBW_BUILD: ${{ matrix.cibw_build }} + CIBW_ARCHS: ${{ matrix.cibw_archs }} + CIBW_BUILD_VERBOSITY: 1 + # FIXME? cibuildwheel disagrees with CPython 3.6 in some way + # FIXME? PQClean GNU extensions break musl + # FIXME? delvewheel chokes specifically on CPython on Windows on ARM + CIBW_SKIP: > + cp36-* + *-musllinux_* + cp*-win*arm* + # https://cibuildwheel.readthedocs.io/en/stable/options/#:~:text=cibuildwheel%20doesn%27t%20yet%20ship%20a%20default%20repair%20command%20for%20Windows%2E + CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel" + CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" + + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} + path: ./dist/*.whl + if-no-files-found: error + diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/feat.S b/Modules/PQClean/common/keccak2x/feat.S similarity index 96% rename from Modules/PQClean/crypto_kem/kyber512/aarch64/feat.S rename to Modules/PQClean/common/keccak2x/feat.S index c214d6f3a..6c8e60beb 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/feat.S +++ b/Modules/PQClean/common/keccak2x/feat.S @@ -123,10 +123,10 @@ SOFTWARE. .endm .align 4 -.global PQCLEAN_KYBER512_AARCH64_f1600x2 -.global _PQCLEAN_KYBER512_AARCH64_f1600x2 -PQCLEAN_KYBER512_AARCH64_f1600x2: -_PQCLEAN_KYBER512_AARCH64_f1600x2: +.global f1600x2 +.global _f1600x2 +f1600x2: +_f1600x2: stp d8, d9, [sp,#-16]! stp d10, d11, [sp,#-16]! stp d12, d13, [sp,#-16]! diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/fips202x2.c b/Modules/PQClean/common/keccak2x/fips202x2.c similarity index 97% rename from Modules/PQClean/crypto_kem/kyber512/aarch64/fips202x2.c rename to Modules/PQClean/common/keccak2x/fips202x2.c index 464d53094..d0e8efdc8 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/fips202x2.c +++ b/Modules/PQClean/common/keccak2x/fips202x2.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -32,9 +33,8 @@ * SOFTWARE. */ -#include -#include #include "fips202x2.h" +#include #define NROUNDS 24 @@ -47,20 +47,20 @@ #define vxor(c, a, b) c = veorq_u64(a, b); // Rotate by n bit ((a << offset) ^ (a >> (64-offset))) #define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); + out = vshlq_n_u64(a, offset); \ + out = vsriq_n_u64(out, a, 64 - offset); // Xor chain: out = a ^ b ^ c ^ d ^ e #define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); // Not And c = ~a & b // #define vbic(c, a, b) c = vbicq_u64(b, a); // Xor Not And: out = a ^ ( (~b) & c) #define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); // Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support #define vrxor(c, a, b) c = vrax1q_u64(a, b); // End Define @@ -100,11 +100,11 @@ static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { * * Arguments: - uint64_t *state: pointer to input/output Keccak state **************************************************/ -extern void PQCLEAN_KYBER512_AARCH64_f1600x2(v128 *, const uint64_t *); +extern void f1600x2(v128 *, const uint64_t *); static inline void KeccakF1600_StatePermutex2(v128 state[25]) { #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_KYBER512_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); + f1600x2(state, neon_KeccakF_RoundConstants); #else v128 Aba, Abe, Abi, Abo, Abu; v128 Aga, Age, Agi, Ago, Agu; diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/fips202x2.h b/Modules/PQClean/common/keccak2x/fips202x2.h similarity index 80% rename from Modules/PQClean/crypto_kem/kyber512/aarch64/fips202x2.h rename to Modules/PQClean/common/keccak2x/fips202x2.h index 14ceb7827..1274a6d21 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/fips202x2.h +++ b/Modules/PQClean/common/keccak2x/fips202x2.h @@ -8,9 +8,9 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" -#include #include +#include +#include typedef uint64x2_t v128; @@ -23,31 +23,26 @@ typedef struct { v128 s[25]; } keccakx2_state; -#define shake128x2_absorb KYBER_NAMESPACE(shake128x2_absorb) void shake128x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake128x2_squeezeblocks KYBER_NAMESPACE(shake128x2_squeezeblocks) void shake128x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake256x2_absorb KYBER_NAMESPACE(shake256x2_absorb) void shake256x2_absorb(keccakx2_state *state, const uint8_t *in0, const uint8_t *in1, size_t inlen); -#define shake256x2_squeezeblocks KYBER_NAMESPACE(shake256x2_squeezeblocks) void shake256x2_squeezeblocks(uint8_t *out0, uint8_t *out1, size_t nblocks, keccakx2_state *state); -#define shake128x2 KYBER_NAMESPACE(shake128x2) void shake128x2(uint8_t *out0, uint8_t *out1, size_t outlen, @@ -55,7 +50,6 @@ void shake128x2(uint8_t *out0, const uint8_t *in1, size_t inlen); -#define shake256x2 KYBER_NAMESPACE(shake256x2) void shake256x2(uint8_t *out0, uint8_t *out1, size_t outlen, diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/LICENSE b/Modules/PQClean/crypto_kem/kyber1024/aarch64/LICENSE index 0e259d42c..093b0a7db 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/LICENSE +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/Makefile b/Modules/PQClean/crypto_kem/kyber1024/aarch64/Makefile index 82aded27c..9062653c3 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/Makefile +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/Makefile @@ -1,11 +1,15 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber1024_aarch64.a -HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h -OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o feat.o +HEADERS=api.h cbd.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h +OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) + all: $(LIB) %.o: %.c $(HEADERS) @@ -14,8 +18,11 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) clean: $(RM) $(OBJECTS) diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/NTT_params.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/NTT_params.h index d09348204..9bf560930 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/NTT_params.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/NTT_params.h @@ -1,8 +1,10 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_KYBER1024_AARCH64_NTT_PARAMS_H +#define PQCLEAN_KYBER1024_AARCH64_NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_NTT.S b/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_NTT.S index 0469fcd6d..65fa23f99 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_NTT.S +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,165 +36,188 @@ PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top: _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top: - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 + push_simd + Q .req w8 + src .req x0 + table .req x1 + counter .req x11 ldrsh Q, [x2, #0] - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 + ldr q0, [table, # 0*16] + ldr q1, [table, # 1*16] + ldr q2, [table, # 2*16] + ldr q3, [table, # 3*16] mov v0.H[0], Q - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr q13, [src, # 9*32] + ldr q15, [src, #11*32] + ldr q17, [src, #13*32] + ldr q19, [src, #15*32] + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32 + + qo_butterfly_mixll \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32, \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixsls \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_botsls \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #9*32, #11*32, #13*32, #15*32, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32 + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) + + qo_butterfly_mixl \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixss \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_botss \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 + .unreq src .unreq table .unreq counter - pop_all - - br lr + pop_simd + ret .align 2 .global PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot @@ -199,13 +225,13 @@ _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top: PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot: _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot: - push_all - Q .req w20 - BarrettM .req w21 + push_simd + Q .req w8 + BarrettM .req w9 src0 .req x0 src1 .req x1 - table .req x28 - counter .req x19 + table .req x10 + counter .req x11 ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] @@ -215,99 +241,127 @@ _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, # 1*16] + ldr q29, [src1, # 1*16] + ldr q30, [src0, # 3*16] + ldr q31, [src1, # 3*16] + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 mov counter, #3 _ntt_bot_loop: - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + str q28, [src0, # 1*16] + ldr q28, [src0, #(64+1*16)] + str q29, [src1, # 1*16] + ldr q29, [src1, #(64+1*16)] + str q30, [src0, # 3*16] + ldr q30, [src0, #(64+3*16)] + str q31, [src1, # 3*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + str q28, [src0, # 1*16] + str q29, [src1, # 1*16] + str q30, [src0, # 3*16] + str q31, [src1, # 3*16] + + add src0, src0, #64 + add src1, src1, #64 .unreq Q .unreq BarrettM @@ -315,12 +369,9 @@ _PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot: .unreq src1 .unreq table .unreq counter - pop_all - - br lr - - + pop_simd + ret diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_base_mul.S b/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_base_mul.S index 1b7aed006..fe18783cc 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_base_mul.S +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_base_mul.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -44,44 +47,195 @@ _PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended: ldrsh Q, [x3] - dup v20.8H, Q - - // TODO: unroll this, currently we are using only 16 SIMD registers - mov counter, #4 - _point_mul_extended_loop: - - ld2 { v0.8H, v1.8H}, [src1], #32 - ld2 { v2.8H, v3.8H}, [src1], #32 - ld2 { v4.8H, v5.8H}, [src1], #32 - ld2 { v6.8H, v7.8H}, [src1], #32 + dup v28.8H, Q - ld2 { v8.8H, v9.8H}, [src2ex], #32 - ld2 {v10.8H, v11.8H}, [src2ex], #32 - ld2 {v12.8H, v13.8H}, [src2ex], #32 - ld2 {v14.8H, v15.8H}, [src2ex], #32 + ldr q0, [src1, #0*16] + ldr q1, [src1, #1*16] + ldr q2, [src1, #2*16] + ldr q3, [src1, #3*16] + ldr q4, [src1, #4*16] + ldr q5, [src1, #5*16] + ldr q6, [src1, #6*16] + ldr q7, [src1, #7*16] + + add src1, src1, #8*16 + + uzp2 v1.8H, v0.8H, v1.8H + uzp2 v3.8H, v2.8H, v3.8H + uzp2 v5.8H, v4.8H, v5.8H + uzp2 v7.8H, v6.8H, v7.8H + + ldr q8, [src2ex, #0*16] + ldr q10, [src2ex, #2*16] + ldr q12, [src2ex, #4*16] + ldr q14, [src2ex, #6*16] + ldr q9, [src2ex, #1*16] + ldr q11, [src2ex, #3*16] + ldr q13, [src2ex, #5*16] + ldr q15, [src2ex, #7*16] + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] + sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] + sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] + sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] + sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] + mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] + mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] + mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] + mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q0, [src1, #0*16] + sqrdmulh v16.8H, v17.8H, v8.8H + ldr q1, [src1, #1*16] + sqrdmulh v18.8H, v19.8H, v10.8H + ldr q2, [src1, #2*16] + sqrdmulh v20.8H, v21.8H, v12.8H + ldr q3, [src1, #3*16] + sqrdmulh v22.8H, v23.8H, v14.8H + + ldr q4, [src1, #4*16] + mul v17.8H, v17.8H, v9.8H + uzp2 v1.8H, v0.8H, v1.8H + ldr q5, [src1, #5*16] + mul v19.8H, v19.8H, v11.8H + uzp2 v3.8H, v2.8H, v3.8H + ldr q6, [src1, #6*16] + mul v21.8H, v21.8H, v13.8H + uzp2 v5.8H, v4.8H, v5.8H + ldr q7, [src1, #7*16] + mul v23.8H, v23.8H, v15.8H + uzp2 v7.8H, v6.8H, v7.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v17.8H, v16.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v19.8H, v18.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v21.8H, v20.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v23.8H, v22.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q17, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q19, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q21, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q23, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H - mls v1.8H, v0.8H, v20.8H - mls v3.8H, v2.8H, v20.8H - mls v5.8H, v4.8H, v20.8H - mls v7.8H, v6.8H, v20.8H + add src1, src1, #8*16 - st1 { v1.8H}, [des], #16 - st1 { v3.8H}, [des], #16 - st1 { v5.8H}, [des], #16 - st1 { v7.8H}, [des], #16 + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + sqrdmulh v16.8H, v17.8H, v8.8H + sqrdmulh v18.8H, v19.8H, v10.8H + sqrdmulh v20.8H, v21.8H, v12.8H + sqrdmulh v22.8H, v23.8H, v14.8H + + mul v17.8H, v17.8H, v9.8H + mul v19.8H, v19.8H, v11.8H + mul v21.8H, v21.8H, v13.8H + mul v23.8H, v23.8H, v15.8H + + mls v17.8H, v16.8H, v28.8H + mls v19.8H, v18.8H, v28.8H + mls v21.8H, v20.8H, v28.8H + mls v23.8H, v22.8H, v28.8H + + str q17, [des, #0*16] + str q19, [des, #1*16] + str q21, [des, #2*16] + str q23, [des, #3*16] + + add des, des, #4*16 - sub counter, counter, #1 - cbnz counter, _point_mul_extended_loop .unreq Q .unreq des @@ -90,7 +244,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended: .unreq counter pop_all - br lr + ret .align 2 @@ -100,8 +254,6 @@ PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: push_all - Q .req w28 - Qprime2 .req w27 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -117,8 +269,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] + ldr s4, [x3] add des, x4, #0 @@ -138,94 +289,294 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: add src2asy_3, src2asy_0, #256*3 #endif - dup v28.8H, Q - dup v29.8H, Qprime2 + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - // TODO:interleaving - mov counter, #16 - _asymmetric_mul_loop: + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + // TODO:interleaving + mov counter, #15 + _asymmetric_mul_loop: + + ldr q20, [src1_0, #0*16] + uzp1 v6.8H, v16.8H, v18.8H + ldr q21, [src1_0, #1*16] + uzp1 v7.8H, v17.8H, v19.8H + + ldr q22, [src2_0, #0*16] + mul v6.8H, v6.8H, v4.H[1] + ldr q23, [src2_0, #1*16] + mul v7.8H, v7.8H, v4.H[1] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + smlal v16.4S, v6.4H, v4.H[0] + uzp1 v0.8H, v20.8H, v21.8H + smlal2 v18.4S, v6.8H, v4.H[0] + uzp2 v1.8H, v20.8H, v21.8H + smlal v17.4S, v7.4H, v4.H[0] + uzp1 v2.8H, v22.8H, v23.8H + smlal2 v19.4S, v7.8H, v4.H[0] + uzp2 v3.8H, v22.8H, v23.8H + + ld1 {v28.8H}, [src2asy_0], #16 + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H +#endif + +#else - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - st2 {v24.8H, v25.8H}, [des], #32 +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_loop - .unreq Q - .unreq Qprime2 + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -242,7 +593,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul: .unreq counter pop_all - br lr + ret .align 2 @@ -252,10 +603,6 @@ PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: push_all - Q .req w28 - Qprime2 .req w27 - R3 .req w26 - R3p .req w25 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -271,11 +618,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] - - ldrsh R3, [x3, #8] - ldrsh R3p, [x3, #10] + ldr q4, [x3] add des, x4, #0 @@ -295,108 +638,312 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: add src2asy_3, src2asy_0, #256*3 #endif - dup v26.8H, R3 - dup v27.8H, R3p + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - dup v28.8H, Q - dup v29.8H, Qprime2 + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 - // TODO: interleaving - mov counter, #16 - _asymmetric_mul_montgomery_loop: + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + mov counter, #15 + _asymmetric_mul_montgomery_loop: + + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + ldr q20, [src1_0, #0*16] + smlal v16.4S, v6.4H, v4.H[0] + ldr q21, [src1_0, #1*16] + smlal2 v18.4S, v6.8H, v4.H[0] + ldr q22, [src2_0, #0*16] + smlal v17.4S, v7.4H, v4.H[0] + ldr q23, [src2_0, #1*16] + smlal2 v19.4S, v7.8H, v4.H[0] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H + uzp1 v0.8H, v20.8H, v21.8H + sqrdmulh v16.8H, v6.8H, v4.H[4] + uzp2 v1.8H, v20.8H, v21.8H + sqrdmulh v17.8H, v7.8H, v4.H[4] - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + uzp1 v2.8H, v22.8H, v23.8H + mul v6.8H, v6.8H, v4.H[5] + uzp2 v3.8H, v22.8H, v23.8H + mul v7.8H, v7.8H, v4.H[5] - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] - sqrdmulh v16.8H, v24.8H, v26.8H - sqrdmulh v17.8H, v25.8H, v26.8H + st2 { v6.8H, v7.8H}, [des], #32 - mul v24.8H, v24.8H, v27.8H - mul v25.8H, v25.8H, v27.8H + ld1 {v28.8H}, [src2asy_0], #16 - mls v24.8H, v16.8H, v28.8H - mls v25.8H, v17.8H, v28.8H + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H + +#endif - st2 {v24.8H, v25.8H}, [des], #32 +#else + + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H + +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_montgomery_loop - .unreq Q - .unreq Qprime2 - .unreq R3 - .unreq R3p + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + sqrdmulh v16.8H, v6.8H, v4.H[4] + sqrdmulh v17.8H, v7.8H, v4.H[4] + + mul v6.8H, v6.8H, v4.H[5] + mul v7.8H, v7.8H, v4.H[5] + + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -413,7 +960,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery: .unreq counter pop_all - br lr + ret diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_iNTT.S b/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_iNTT.S index 930b519c0..58a524ac2 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_iNTT.S +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -49,57 +52,116 @@ _PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - mov counter, #4 + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, #1*16] + ldr q29, [src1, #1*16] + ldr q30, [src0, #3*16] + ldr q31, [src1, #3*16] + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 + + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 + + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 + + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 + + mov counter, #3 _intt_bot_loop: - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM + str q24, [src0, #0*16] + ldr q28, [src0, #(64+1*16)] + str q25, [src1, #0*16] + ldr q29, [src1, #(64+1*16)] + str q26, [src0, #2*16] + ldr q30, [src0, #(64+3*16)] + str q27, [src1, #2*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 - do_butterfly_vec_mixed_rev v28, v30, v18, v19, v29, v31, v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, v8, v9, v10, v11 - do_butterfly_vec_mixed_rev v24, v26, v16, v17, v25, v27, v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, v6, v7, v6, v7 - do_butterfly_vec_mixed_rev v28, v29, v18, v19, v30, v31, v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, v4, v5, v4, v5 - do_butterfly_vec_mixed_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 - do_butterfly_vec_mixed_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 - qo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v1, #11, v0 + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 sub counter, counter, #1 cbnz counter, _intt_bot_loop + str q24, [src0, #0*16] + str q25, [src1, #0*16] + str q26, [src0, #2*16] + str q27, [src1, #2*16] + + .unreq Q .unreq BarrettM .unreq src0 @@ -108,7 +170,7 @@ _PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot: .unreq counter pop_all - br lr + ret .align 2 .global PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top @@ -121,245 +183,131 @@ _PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top: BarrettM .req w21 invN .req w22 invN_f .req w23 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 + table .req x1 counter .req x19 - ldrsh Q, [x2, #0] + ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] - ldr invN, [x2, #10] - ldr invN_f, [x2, #14] - - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 - - mov v0.H[0], Q - - dup v24.8H, Q - dup v25.8H, BarrettM - - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr invN, [x2, #10] + ldr invN_f, [x2, #14] + + mov v4.S[0], invN + mov v4.S[1], invN_f + + ldr q0, [table, #0*16] + mov v0.H[0], Q + + ldr q1, [table, #1*16] + ldr q2, [table, #2*16] + ldr q3, [table, #3*16] + + ldr q16, [src, # 8*32] + ldr q17, [src, # 9*32] + ldr q18, [src, #10*32] + ldr q19, [src, #11*32] + ldr q20, [src, #12*32] + ldr q21, [src, #13*32] + ldr q22, [src, #14*32] + ldr q23, [src, #15*32] + + qo_butterfly_botll \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q12, q13, q14, q15, \ + #4*32, #5*32, #6*32, #7*32 + + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_topsl \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #8*32, #9*32, #10*32, #11*32, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + qo_montgomery_mul_insl \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #12*32, #13*32, #14*32, #15*32, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_butterfly_botsl_mul \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32), \ + v12, v13, v14, v15, v24, v25, v26, v27, \ + v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0 + + str q12, [src, # 4*32] + ldr q12, [src, #(16+ 4*32)] + str q13, [src, # 5*32] + ldr q13, [src, #(16+ 5*32)] + str q14, [src, # 6*32] + ldr q14, [src, #(16+ 6*32)] + str q15, [src, # 7*32] + ldr q15, [src, #(16+ 7*32)] + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_tops \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + + qo_montgomery_mul_ins \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_montgomery_mul_ins \ + v12, v13, v14, v15, v24, v25, v26, v27, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32) + + str q12, [src, #(16+ 4*32)] + str q13, [src, #(16+ 5*32)] + str q14, [src, #(16+ 6*32)] + str q15, [src, #(16+ 7*32)] .unreq Q .unreq BarrettM .unreq invN .unreq invN_f - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr - - - - + ret diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_poly.S b/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_poly.S index 00fec3d05..7d461016c 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_poly.S +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_KYBER1024_AARCH64_asm_add_reduce -.global _PQCLEAN_KYBER1024_AARCH64_asm_add_reduce -PQCLEAN_KYBER1024_AARCH64_asm_add_reduce: -_PQCLEAN_KYBER1024_AARCH64_asm_add_reduce: +.global PQCLEAN_KYBER1024_AARCH64__asm_add_reduce +.global _PQCLEAN_KYBER1024_AARCH64__asm_add_reduce +PQCLEAN_KYBER1024_AARCH64__asm_add_reduce: +_PQCLEAN_KYBER1024_AARCH64__asm_add_reduce: mov w4, #3329 mov w5, #25519 @@ -86,13 +89,13 @@ _PQCLEAN_KYBER1024_AARCH64_asm_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce -.global _PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce -PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce: -_PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce: +.global PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce +.global _PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce +PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce: +_PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce: mov w4, #3329 mov w5, #25519 @@ -147,13 +150,13 @@ _PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce -.global _PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce -PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce: -_PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce: +.global PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce +.global _PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce +PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce: +_PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce: mov w4, #3329 mov w5, #25519 @@ -232,7 +235,7 @@ _PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v20.8H, v21.8H, v22.8H, v23.8H}, [x0], #64 - br lr + ret diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/api.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/api.h index e09f90cbc..00373e019 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/api.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/api.h @@ -13,7 +13,7 @@ #define PQCLEAN_KYBER1024_AARCH64_CRYPTO_PUBLICKEYBYTES 1568 #define PQCLEAN_KYBER1024_AARCH64_CRYPTO_CIPHERTEXTBYTES 1568 #define PQCLEAN_KYBER1024_AARCH64_CRYPTO_BYTES 32 -#define PQCLEAN_KYBER1024_AARCH64_CRYPTO_ALGNAME "Kyber1024" +#define PQCLEAN_KYBER1024_AARCH64_CRYPTO_ALGNAME "Kyber1024" int PQCLEAN_KYBER1024_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/cbd.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/cbd.h index 47a06806e..ca8ae2b9e 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/cbd.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/cbd.h @@ -1,5 +1,5 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER1024_AARCH64_CBD_H +#define PQCLEAN_KYBER1024_AARCH64_CBD_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "poly.h" -#include #define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1) void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/feat.S b/Modules/PQClean/crypto_kem/kyber1024/aarch64/feat.S deleted file mode 100644 index d7dda5bc4..000000000 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global PQCLEAN_KYBER1024_AARCH64_f1600x2 -.global _PQCLEAN_KYBER1024_AARCH64_f1600x2 -PQCLEAN_KYBER1024_AARCH64_f1600x2: -_PQCLEAN_KYBER1024_AARCH64_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/fips202x2.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/fips202x2.c deleted file mode 100644 index 3cefe8481..000000000 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/fips202x2.c +++ /dev/null @@ -1,684 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We choose - * CC0 1.0 Universal or the following MIT License for this file. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void PQCLEAN_KYBER1024_AARCH64_f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_KYBER1024_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/fips202x2.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/fips202x2.h deleted file mode 100644 index 14ceb7827..000000000 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/fips202x2.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include "params.h" -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -#define shake128x2_absorb KYBER_NAMESPACE(shake128x2_absorb) -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake128x2_squeezeblocks KYBER_NAMESPACE(shake128x2_squeezeblocks) -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake256x2_absorb KYBER_NAMESPACE(shake256x2_absorb) -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2_squeezeblocks KYBER_NAMESPACE(shake256x2_squeezeblocks) -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake128x2 KYBER_NAMESPACE(shake128x2) -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2 KYBER_NAMESPACE(shake256x2) -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#endif diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/indcpa.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/indcpa.c index 8648f17bf..5b8c5f3ad 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/indcpa.c +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/indcpa.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/indcpa.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/indcpa.h index f93487a37..6357746ce 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/indcpa.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/indcpa.h @@ -1,5 +1,5 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER1024_AARCH64_INDCPA_H +#define PQCLEAN_KYBER1024_AARCH64_INDCPA_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "polyvec.h" -#include #define gen_matrix KYBER_NAMESPACE(gen_matrix) void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/kem.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/kem.c index 670a4c599..572b5e93a 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/kem.c +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/kem.c @@ -8,12 +8,14 @@ #include #include #include + +#include "api.h" #include "params.h" +#include "kem.h" #include "indcpa.h" #include "verify.h" #include "symmetric.h" #include "randombytes.h" -#include "kem.h" /************************************************* * Name: crypto_kem_keypair_derand diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/kem.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/kem.h index f6f0bd692..b674a77fc 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/kem.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/kem.h @@ -1,5 +1,5 @@ -#ifndef KEM_H -#define KEM_H +#ifndef PQCLEAN_KYBER1024_AARCH64_KEM_H +#define PQCLEAN_KYBER1024_AARCH64_KEM_H /* * This file is licensed @@ -7,13 +7,8 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include - -#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES -#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES -#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES -#define CRYPTO_BYTES KYBER_SSBYTES +#include "params.h" #define CRYPTO_ALGNAME "Kyber1024" @@ -33,3 +28,4 @@ int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif + diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/macros.inc b/Modules/PQClean/crypto_kem/kyber1024/aarch64/macros.inc index 2add309e0..5504405c1 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/macros.inc +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,11 +28,114 @@ * SOFTWARE. */ -#ifndef MACROS_S -#define MACROS_S - #include "macros_common.inc" +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + .macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H .endm @@ -52,16 +158,68 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + .macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm -.macro qo_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm -.macro qo_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H .endm @@ -69,18 +227,176 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + .macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm -.macro do_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H .endm -.macro do_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm -#endif +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== + +.macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 +.endm + +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm +// === 32-bit end ==== diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/macros_common.inc b/Modules/PQClean/crypto_kem/kyber1024/aarch64/macros_common.inc index c1ac021cd..07568491d 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/macros_common.inc +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,31 +35,51 @@ .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -75,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -85,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -102,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -138,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -152,34 +310,340 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] mul \t4\wX, \b4\wX, \z4\nX[\h4] sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] mul \t5\wX, \b5\wX, \z5\nX[\h5] sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] mul \t6\wX, \b6\wX, \z6\nX[\h6] sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] mul \t7\wX, \b7\wX, \z7\nX[\h7] + ldr \d0, [\srcd_ptr, \memd0] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] add \a2\wX, \a2\wX, \t2\wX sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] add \a3\wX, \a3\wX, \t3\wX sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -221,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -231,7 +769,53 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D + +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX @@ -248,15 +832,77 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + ldr \c0, [\srcc_ptr, \memc0] mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] mul \t1\wX, \b1\wX, \h1\wX sub \b3\wX, \a3\wX, \t3\wX + ldr \c2, [\srcc_ptr, \memc2] sqrdmulh \b0\wX, \b0\wX, \l0\wX add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] sqrdmulh \b1\wX, \b1\wX, \l1\wX add \a3\wX, \a3\wX, \t3\wX @@ -269,53 +915,53 @@ .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -394,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_poly.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_poly.c index fc9581020..e7ae26ba1 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_poly.c +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_poly.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -129,14 +130,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER1024_AARCH64_asm_add_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER1024_AARCH64__asm_add_reduce(int16_t *, const int16_t *); void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER1024_AARCH64_asm_add_reduce(c, a); + PQCLEAN_KYBER1024_AARCH64__asm_add_reduce(c, a); } -extern void PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); +extern void PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) { - PQCLEAN_KYBER1024_AARCH64_asm_add_add_reduce(c, a, b); + PQCLEAN_KYBER1024_AARCH64__asm_add_add_reduce(c, a, b); } /************************************************* @@ -150,7 +151,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce(int16_t *, const int16_t *); void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER1024_AARCH64_asm_sub_reduce(c, a); + PQCLEAN_KYBER1024_AARCH64__asm_sub_reduce(c, a); } diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_polyvec.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_polyvec.c index c05f59d66..8787fcde6 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_polyvec.c +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_polyvec.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -83,7 +84,7 @@ void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]) { * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]) { +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { // c = c + a; @@ -91,4 +92,3 @@ void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYB neon_poly_add_reduce(c[i], a[i]); } } - diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c index a5a2e7833..9a59724e7 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/neon_symmetric-shake.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -35,7 +36,7 @@ #include #include #include "params.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "symmetric.h" /************************************************* @@ -88,8 +89,8 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce1, uint8_t nonce2) { unsigned int i; - uint8_t extkey1[KYBER_SYMBYTES + 1 + 15]; - uint8_t extkey2[KYBER_SYMBYTES + 1 + 15]; + uint8_t extkey1[KYBER_SYMBYTES + 1]; + uint8_t extkey2[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey1[i] = key[i]; @@ -99,5 +100,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, extkey1[i] = nonce1; extkey2[i] = nonce2; - shake256x2(out1, out2, outlen, extkey1, extkey2, KYBER_SYMBYTES + 1); + shake256x2(out1, out2, outlen, extkey1, extkey2, sizeof(extkey1)); } diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/ntt.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/ntt.c index 8bca765e2..09583b73f 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/ntt.c +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/ntt.c @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,12 +28,35 @@ * SOFTWARE. */ -#include +#include #include "params.h" #include "ntt.h" -#include "reduce.h" #include "NTT_params.h" +const __attribute__ ((aligned (16)))int16_t asymmetric_const[8] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t constants[16] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, + invNQ1_R3modQ1_prime_half, + invNQ1_R3modQ1_doubleprime, + invNQ1_final_R3modQ1_prime_half, + invNQ1_final_R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { + 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 +}; + +const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { + 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { + 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 +}; + /************************************************* * Name: ntt * diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/ntt.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/ntt.h index fc6d4a94c..5eaa0bdbd 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/ntt.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/ntt.h @@ -1,12 +1,15 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER1024_AARCH64_NTT_H +#define PQCLEAN_KYBER1024_AARCH64_NTT_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,12 +35,11 @@ #include "NTT_params.h" -extern const int16_t zetas[128]; - -#define ntt KYBER_NAMESPACE(ntt) -void ntt(int16_t r[256]); -#define invntt KYBER_NAMESPACE(invntt) -void invntt(int16_t r[256]); +#define asymmetric_const KYBER_NAMESPACE(asymmetric_const) +#define constants KYBER_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define pre_asymmetric_table_Q1_extended KYBER_NAMESPACE(pre_asymmetric_table_Q1_extended) +#define streamlined_inv_GS_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_inv_GS_negacyclic_table_Q1_jump_extended) extern void PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); @@ -49,38 +51,33 @@ extern void PQCLEAN_KYBER1024_AARCH64__asm_point_mul_extended(int16_t *, const i extern void PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); extern void PQCLEAN_KYBER1024_AARCH64__asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); -static const int16_t asymmetric_const[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime -}; +extern +const int16_t asymmetric_const[8]; +extern +const int16_t constants[16]; -#define NTT(in) { \ - PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; -#define iNTT(in) { \ - PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +extern +const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; -static const int16_t constants[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, - invNQ1_R3modQ1_prime_half, - invNQ1_R3modQ1_doubleprime, - invNQ1_final_R3modQ1_prime_half, - invNQ1_final_R3modQ1_doubleprime -}; +extern +const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; -static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, -7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0 -}; +#define NTT(in) do { \ + PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER1024_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) -static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { - 167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175 -}; +#define iNTT(in) do { \ + PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER1024_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) -static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0 -}; +#define ntt KYBER_NAMESPACE(ntt) +void ntt(int16_t r[256]); +#define invntt KYBER_NAMESPACE(invntt) +void invntt(int16_t r[256]); #endif diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/params.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/params.h index f6cb8131a..455d12a42 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/params.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/params.h @@ -7,11 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -//#define KYBER_90S /* Uncomment this if you want the 90S variant */ - #define KYBER_NAMESPACE(s) PQCLEAN_KYBER1024_AARCH64_##s -#define KYBER_K 4 +/* Don't change parameters below this line */ #define KYBER_N 256 #define KYBER_Q 3329 @@ -21,6 +19,7 @@ #define KYBER_POLYBYTES 384 #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) +#define KYBER_K 4 #define KYBER_ETA1 2 #define KYBER_POLYCOMPRESSEDBYTES 160 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/poly.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/poly.c index 6250e60a0..0c26205d2 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/poly.c +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/poly.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/kyber/blob/master/ref * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/poly.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/poly.h index 83c35067e..2882f11f5 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/poly.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER1024_AARCH64_POLY_H +#define PQCLEAN_KYBER1024_AARCH64_POLY_H /* * This file is licensed @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial @@ -30,7 +30,7 @@ void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const int16_t a[KYBER_N]); #define poly_frommsg KYBER_NAMESPACE(poly_frommsg) void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]); #define poly_tomsg KYBER_NAMESPACE(poly_tomsg) -void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t a[KYBER_N]); +void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t r[KYBER_N]); // NEON diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/polyvec.c b/Modules/PQClean/crypto_kem/kyber1024/aarch64/polyvec.c index 7142cb39c..f382543aa 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/polyvec.c +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/polyvec.c @@ -19,7 +19,7 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; uint16_t t[8]; @@ -89,7 +89,7 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV * (needs space for KYBER_POLYVECBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { poly_tobytes(r + i * KYBER_POLYBYTES, a[i]); @@ -112,4 +112,3 @@ void polyvec_frombytes(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVE poly_frombytes(r[i], a + i * KYBER_POLYBYTES); } } - diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/polyvec.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/polyvec.h index 827610d63..835db2927 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/polyvec.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER1024_AARCH64_POLYVEC_H +#define PQCLEAN_KYBER1024_AARCH64_POLYVEC_H /* * This file was originally licensed @@ -7,8 +7,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -34,21 +35,21 @@ * SOFTWARE. */ +#include #include "params.h" #include "poly.h" -#include typedef struct { poly vec[KYBER_K]; } polyvec; #define polyvec_compress KYBER_NAMESPACE(polyvec_compress) -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress) void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); #define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes) -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes) void polyvec_frombytes(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECBYTES]); @@ -61,6 +62,6 @@ void neon_polyvec_ntt(int16_t r[KYBER_K][KYBER_N]); void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]); #define neon_polyvec_add_reduce KYBER_NAMESPACE(polyvec_add_reduce) -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]); +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]); #endif diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/reduce.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/reduce.h index 7d0f8e3bc..1cb8265b6 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/reduce.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER1024_AARCH64_REDUCE_H +#define PQCLEAN_KYBER1024_AARCH64_REDUCE_H /* * This file is licensed @@ -7,8 +7,8 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" #define MONT (-1044) // 2^16 mod q #define QINV (-3327) // q^-1 mod 2^16 diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/rejsample.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/rejsample.h index ee9ae85c8..40d8dce6c 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/rejsample.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/rejsample.h @@ -1,5 +1,5 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H +#ifndef PQCLEAN_KYBER1024_AARCH64_REJSAMPLE_H +#define PQCLEAN_KYBER1024_AARCH64_REJSAMPLE_H /* * This file is licensed @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" #define neon_rej_uniform KYBER_NAMESPACE(_neon_rej_uniform) unsigned int neon_rej_uniform(int16_t *r, diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/symmetric.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/symmetric.h index cb9ea69e8..0a8b8f8aa 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/symmetric.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_KYBER1024_AARCH64_SYMMETRIC_H +#define PQCLEAN_KYBER1024_AARCH64_SYMMETRIC_H /* * This file is licensed @@ -8,9 +8,9 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include #include +#include "params.h" #include "fips202.h" @@ -37,7 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) // NEON Definition -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" typedef keccakx2_state neon_xof_state; diff --git a/Modules/PQClean/crypto_kem/kyber1024/aarch64/verify.h b/Modules/PQClean/crypto_kem/kyber1024/aarch64/verify.h index 3b9eca9f6..4819e0dbd 100644 --- a/Modules/PQClean/crypto_kem/kyber1024/aarch64/verify.h +++ b/Modules/PQClean/crypto_kem/kyber1024/aarch64/verify.h @@ -1,5 +1,5 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER1024_AARCH64_VERIFY_H +#define PQCLEAN_KYBER1024_AARCH64_VERIFY_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include #include +#include "params.h" #define verify KYBER_NAMESPACE(verify) int verify(const uint8_t *a, const uint8_t *b, size_t len); diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/LICENSE b/Modules/PQClean/crypto_kem/kyber512/aarch64/LICENSE index 0e259d42c..093b0a7db 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/LICENSE +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/Makefile b/Modules/PQClean/crypto_kem/kyber512/aarch64/Makefile index c6affa409..40cc55b1f 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/Makefile +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/Makefile @@ -1,11 +1,15 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber512_aarch64.a -HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h -OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o feat.o +HEADERS=api.h cbd.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h +OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) + all: $(LIB) %.o: %.c $(HEADERS) @@ -14,8 +18,12 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) + clean: $(RM) $(OBJECTS) diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/NTT_params.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/NTT_params.h index d09348204..ccde0122e 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/NTT_params.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/NTT_params.h @@ -1,8 +1,10 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_KYBER512_AARCH64_NTT_PARAMS_H +#define PQCLEAN_KYBER512_AARCH64_NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_NTT.S b/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_NTT.S index 47b75efa8..ebcf6bd53 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_NTT.S +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,165 +36,188 @@ PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top: _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top: - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 + push_simd + Q .req w8 + src .req x0 + table .req x1 + counter .req x11 ldrsh Q, [x2, #0] - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 + ldr q0, [table, # 0*16] + ldr q1, [table, # 1*16] + ldr q2, [table, # 2*16] + ldr q3, [table, # 3*16] mov v0.H[0], Q - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr q13, [src, # 9*32] + ldr q15, [src, #11*32] + ldr q17, [src, #13*32] + ldr q19, [src, #15*32] + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32 + + qo_butterfly_mixll \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32, \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixsls \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_botsls \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #9*32, #11*32, #13*32, #15*32, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32 + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) + + qo_butterfly_mixl \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixss \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_botss \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 + .unreq src .unreq table .unreq counter - pop_all - - br lr + pop_simd + ret .align 2 .global PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot @@ -199,13 +225,13 @@ _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top: PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot: _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot: - push_all - Q .req w20 - BarrettM .req w21 + push_simd + Q .req w8 + BarrettM .req w9 src0 .req x0 src1 .req x1 - table .req x28 - counter .req x19 + table .req x10 + counter .req x11 ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] @@ -215,99 +241,127 @@ _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, # 1*16] + ldr q29, [src1, # 1*16] + ldr q30, [src0, # 3*16] + ldr q31, [src1, # 3*16] + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 mov counter, #3 _ntt_bot_loop: - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + str q28, [src0, # 1*16] + ldr q28, [src0, #(64+1*16)] + str q29, [src1, # 1*16] + ldr q29, [src1, #(64+1*16)] + str q30, [src0, # 3*16] + ldr q30, [src0, #(64+3*16)] + str q31, [src1, # 3*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + str q28, [src0, # 1*16] + str q29, [src1, # 1*16] + str q30, [src0, # 3*16] + str q31, [src1, # 3*16] + + add src0, src0, #64 + add src1, src1, #64 .unreq Q .unreq BarrettM @@ -315,12 +369,9 @@ _PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot: .unreq src1 .unreq table .unreq counter - pop_all - - br lr - - + pop_simd + ret diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_base_mul.S b/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_base_mul.S index 1c346564c..a75bb6497 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_base_mul.S +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_base_mul.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -44,44 +47,195 @@ _PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended: ldrsh Q, [x3] - dup v20.8H, Q - - // TODO: unroll this, currently we are using only 16 SIMD registers - mov counter, #4 - _point_mul_extended_loop: - - ld2 { v0.8H, v1.8H}, [src1], #32 - ld2 { v2.8H, v3.8H}, [src1], #32 - ld2 { v4.8H, v5.8H}, [src1], #32 - ld2 { v6.8H, v7.8H}, [src1], #32 + dup v28.8H, Q - ld2 { v8.8H, v9.8H}, [src2ex], #32 - ld2 {v10.8H, v11.8H}, [src2ex], #32 - ld2 {v12.8H, v13.8H}, [src2ex], #32 - ld2 {v14.8H, v15.8H}, [src2ex], #32 + ldr q0, [src1, #0*16] + ldr q1, [src1, #1*16] + ldr q2, [src1, #2*16] + ldr q3, [src1, #3*16] + ldr q4, [src1, #4*16] + ldr q5, [src1, #5*16] + ldr q6, [src1, #6*16] + ldr q7, [src1, #7*16] + + add src1, src1, #8*16 + + uzp2 v1.8H, v0.8H, v1.8H + uzp2 v3.8H, v2.8H, v3.8H + uzp2 v5.8H, v4.8H, v5.8H + uzp2 v7.8H, v6.8H, v7.8H + + ldr q8, [src2ex, #0*16] + ldr q10, [src2ex, #2*16] + ldr q12, [src2ex, #4*16] + ldr q14, [src2ex, #6*16] + ldr q9, [src2ex, #1*16] + ldr q11, [src2ex, #3*16] + ldr q13, [src2ex, #5*16] + ldr q15, [src2ex, #7*16] + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] + sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] + sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] + sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] + sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] + mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] + mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] + mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] + mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q0, [src1, #0*16] + sqrdmulh v16.8H, v17.8H, v8.8H + ldr q1, [src1, #1*16] + sqrdmulh v18.8H, v19.8H, v10.8H + ldr q2, [src1, #2*16] + sqrdmulh v20.8H, v21.8H, v12.8H + ldr q3, [src1, #3*16] + sqrdmulh v22.8H, v23.8H, v14.8H + + ldr q4, [src1, #4*16] + mul v17.8H, v17.8H, v9.8H + uzp2 v1.8H, v0.8H, v1.8H + ldr q5, [src1, #5*16] + mul v19.8H, v19.8H, v11.8H + uzp2 v3.8H, v2.8H, v3.8H + ldr q6, [src1, #6*16] + mul v21.8H, v21.8H, v13.8H + uzp2 v5.8H, v4.8H, v5.8H + ldr q7, [src1, #7*16] + mul v23.8H, v23.8H, v15.8H + uzp2 v7.8H, v6.8H, v7.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v17.8H, v16.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v19.8H, v18.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v21.8H, v20.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v23.8H, v22.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q17, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q19, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q21, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q23, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H - mls v1.8H, v0.8H, v20.8H - mls v3.8H, v2.8H, v20.8H - mls v5.8H, v4.8H, v20.8H - mls v7.8H, v6.8H, v20.8H + add src1, src1, #8*16 - st1 { v1.8H}, [des], #16 - st1 { v3.8H}, [des], #16 - st1 { v5.8H}, [des], #16 - st1 { v7.8H}, [des], #16 + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + sqrdmulh v16.8H, v17.8H, v8.8H + sqrdmulh v18.8H, v19.8H, v10.8H + sqrdmulh v20.8H, v21.8H, v12.8H + sqrdmulh v22.8H, v23.8H, v14.8H + + mul v17.8H, v17.8H, v9.8H + mul v19.8H, v19.8H, v11.8H + mul v21.8H, v21.8H, v13.8H + mul v23.8H, v23.8H, v15.8H + + mls v17.8H, v16.8H, v28.8H + mls v19.8H, v18.8H, v28.8H + mls v21.8H, v20.8H, v28.8H + mls v23.8H, v22.8H, v28.8H + + str q17, [des, #0*16] + str q19, [des, #1*16] + str q21, [des, #2*16] + str q23, [des, #3*16] + + add des, des, #4*16 - sub counter, counter, #1 - cbnz counter, _point_mul_extended_loop .unreq Q .unreq des @@ -90,7 +244,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended: .unreq counter pop_all - br lr + ret .align 2 @@ -100,8 +254,6 @@ PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: push_all - Q .req w28 - Qprime2 .req w27 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -117,8 +269,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] + ldr s4, [x3] add des, x4, #0 @@ -138,94 +289,294 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: add src2asy_3, src2asy_0, #256*3 #endif - dup v28.8H, Q - dup v29.8H, Qprime2 + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - // TODO:interleaving - mov counter, #16 - _asymmetric_mul_loop: + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + // TODO:interleaving + mov counter, #15 + _asymmetric_mul_loop: + + ldr q20, [src1_0, #0*16] + uzp1 v6.8H, v16.8H, v18.8H + ldr q21, [src1_0, #1*16] + uzp1 v7.8H, v17.8H, v19.8H + + ldr q22, [src2_0, #0*16] + mul v6.8H, v6.8H, v4.H[1] + ldr q23, [src2_0, #1*16] + mul v7.8H, v7.8H, v4.H[1] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + smlal v16.4S, v6.4H, v4.H[0] + uzp1 v0.8H, v20.8H, v21.8H + smlal2 v18.4S, v6.8H, v4.H[0] + uzp2 v1.8H, v20.8H, v21.8H + smlal v17.4S, v7.4H, v4.H[0] + uzp1 v2.8H, v22.8H, v23.8H + smlal2 v19.4S, v7.8H, v4.H[0] + uzp2 v3.8H, v22.8H, v23.8H + + ld1 {v28.8H}, [src2asy_0], #16 + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H +#endif + +#else - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - st2 {v24.8H, v25.8H}, [des], #32 +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_loop - .unreq Q - .unreq Qprime2 + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -242,7 +593,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul: .unreq counter pop_all - br lr + ret .align 2 @@ -252,10 +603,6 @@ PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: push_all - Q .req w28 - Qprime2 .req w27 - R3 .req w26 - R3p .req w25 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -271,11 +618,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] - - ldrsh R3, [x3, #8] - ldrsh R3p, [x3, #10] + ldr q4, [x3] add des, x4, #0 @@ -295,108 +638,312 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: add src2asy_3, src2asy_0, #256*3 #endif - dup v26.8H, R3 - dup v27.8H, R3p + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - dup v28.8H, Q - dup v29.8H, Qprime2 + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 - // TODO: interleaving - mov counter, #16 - _asymmetric_mul_montgomery_loop: + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + mov counter, #15 + _asymmetric_mul_montgomery_loop: + + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + ldr q20, [src1_0, #0*16] + smlal v16.4S, v6.4H, v4.H[0] + ldr q21, [src1_0, #1*16] + smlal2 v18.4S, v6.8H, v4.H[0] + ldr q22, [src2_0, #0*16] + smlal v17.4S, v7.4H, v4.H[0] + ldr q23, [src2_0, #1*16] + smlal2 v19.4S, v7.8H, v4.H[0] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H + uzp1 v0.8H, v20.8H, v21.8H + sqrdmulh v16.8H, v6.8H, v4.H[4] + uzp2 v1.8H, v20.8H, v21.8H + sqrdmulh v17.8H, v7.8H, v4.H[4] - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + uzp1 v2.8H, v22.8H, v23.8H + mul v6.8H, v6.8H, v4.H[5] + uzp2 v3.8H, v22.8H, v23.8H + mul v7.8H, v7.8H, v4.H[5] - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] - sqrdmulh v16.8H, v24.8H, v26.8H - sqrdmulh v17.8H, v25.8H, v26.8H + st2 { v6.8H, v7.8H}, [des], #32 - mul v24.8H, v24.8H, v27.8H - mul v25.8H, v25.8H, v27.8H + ld1 {v28.8H}, [src2asy_0], #16 - mls v24.8H, v16.8H, v28.8H - mls v25.8H, v17.8H, v28.8H + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H + +#endif - st2 {v24.8H, v25.8H}, [des], #32 +#else + + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H + +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_montgomery_loop - .unreq Q - .unreq Qprime2 - .unreq R3 - .unreq R3p + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + sqrdmulh v16.8H, v6.8H, v4.H[4] + sqrdmulh v17.8H, v7.8H, v4.H[4] + + mul v6.8H, v6.8H, v4.H[5] + mul v7.8H, v7.8H, v4.H[5] + + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -413,7 +960,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery: .unreq counter pop_all - br lr + ret diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_iNTT.S b/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_iNTT.S index 7acb200f4..57fb734f5 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_iNTT.S +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -49,57 +52,116 @@ _PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - mov counter, #4 + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, #1*16] + ldr q29, [src1, #1*16] + ldr q30, [src0, #3*16] + ldr q31, [src1, #3*16] + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 + + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 + + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 + + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 + + mov counter, #3 _intt_bot_loop: - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM + str q24, [src0, #0*16] + ldr q28, [src0, #(64+1*16)] + str q25, [src1, #0*16] + ldr q29, [src1, #(64+1*16)] + str q26, [src0, #2*16] + ldr q30, [src0, #(64+3*16)] + str q27, [src1, #2*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 - do_butterfly_vec_mixed_rev v28, v30, v18, v19, v29, v31, v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, v8, v9, v10, v11 - do_butterfly_vec_mixed_rev v24, v26, v16, v17, v25, v27, v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, v6, v7, v6, v7 - do_butterfly_vec_mixed_rev v28, v29, v18, v19, v30, v31, v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, v4, v5, v4, v5 - do_butterfly_vec_mixed_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 - do_butterfly_vec_mixed_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 - qo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v1, #11, v0 + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 sub counter, counter, #1 cbnz counter, _intt_bot_loop + str q24, [src0, #0*16] + str q25, [src1, #0*16] + str q26, [src0, #2*16] + str q27, [src1, #2*16] + + .unreq Q .unreq BarrettM .unreq src0 @@ -108,7 +170,7 @@ _PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot: .unreq counter pop_all - br lr + ret .align 2 .global PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top @@ -121,245 +183,131 @@ _PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top: BarrettM .req w21 invN .req w22 invN_f .req w23 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 + table .req x1 counter .req x19 - ldrsh Q, [x2, #0] + ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] - ldr invN, [x2, #10] - ldr invN_f, [x2, #14] - - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 - - mov v0.H[0], Q - - dup v24.8H, Q - dup v25.8H, BarrettM - - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr invN, [x2, #10] + ldr invN_f, [x2, #14] + + mov v4.S[0], invN + mov v4.S[1], invN_f + + ldr q0, [table, #0*16] + mov v0.H[0], Q + + ldr q1, [table, #1*16] + ldr q2, [table, #2*16] + ldr q3, [table, #3*16] + + ldr q16, [src, # 8*32] + ldr q17, [src, # 9*32] + ldr q18, [src, #10*32] + ldr q19, [src, #11*32] + ldr q20, [src, #12*32] + ldr q21, [src, #13*32] + ldr q22, [src, #14*32] + ldr q23, [src, #15*32] + + qo_butterfly_botll \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q12, q13, q14, q15, \ + #4*32, #5*32, #6*32, #7*32 + + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_topsl \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #8*32, #9*32, #10*32, #11*32, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + qo_montgomery_mul_insl \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #12*32, #13*32, #14*32, #15*32, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_butterfly_botsl_mul \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32), \ + v12, v13, v14, v15, v24, v25, v26, v27, \ + v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0 + + str q12, [src, # 4*32] + ldr q12, [src, #(16+ 4*32)] + str q13, [src, # 5*32] + ldr q13, [src, #(16+ 5*32)] + str q14, [src, # 6*32] + ldr q14, [src, #(16+ 6*32)] + str q15, [src, # 7*32] + ldr q15, [src, #(16+ 7*32)] + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_tops \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + + qo_montgomery_mul_ins \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_montgomery_mul_ins \ + v12, v13, v14, v15, v24, v25, v26, v27, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32) + + str q12, [src, #(16+ 4*32)] + str q13, [src, #(16+ 5*32)] + str q14, [src, #(16+ 6*32)] + str q15, [src, #(16+ 7*32)] .unreq Q .unreq BarrettM .unreq invN .unreq invN_f - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr - - - - + ret diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_poly.S b/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_poly.S index d3dcefc6f..175d01ab4 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_poly.S +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_KYBER512_AARCH64_asm_add_reduce -.global _PQCLEAN_KYBER512_AARCH64_asm_add_reduce -PQCLEAN_KYBER512_AARCH64_asm_add_reduce: -_PQCLEAN_KYBER512_AARCH64_asm_add_reduce: +.global PQCLEAN_KYBER512_AARCH64__asm_add_reduce +.global _PQCLEAN_KYBER512_AARCH64__asm_add_reduce +PQCLEAN_KYBER512_AARCH64__asm_add_reduce: +_PQCLEAN_KYBER512_AARCH64__asm_add_reduce: mov w4, #3329 mov w5, #25519 @@ -86,13 +89,13 @@ _PQCLEAN_KYBER512_AARCH64_asm_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER512_AARCH64_asm_sub_reduce -.global _PQCLEAN_KYBER512_AARCH64_asm_sub_reduce -PQCLEAN_KYBER512_AARCH64_asm_sub_reduce: -_PQCLEAN_KYBER512_AARCH64_asm_sub_reduce: +.global PQCLEAN_KYBER512_AARCH64__asm_sub_reduce +.global _PQCLEAN_KYBER512_AARCH64__asm_sub_reduce +PQCLEAN_KYBER512_AARCH64__asm_sub_reduce: +_PQCLEAN_KYBER512_AARCH64__asm_sub_reduce: mov w4, #3329 mov w5, #25519 @@ -147,13 +150,13 @@ _PQCLEAN_KYBER512_AARCH64_asm_sub_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce -.global _PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce -PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce: -_PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce: +.global PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce +.global _PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce +PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce: +_PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce: mov w4, #3329 mov w5, #25519 @@ -232,7 +235,7 @@ _PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v20.8H, v21.8H, v22.8H, v23.8H}, [x0], #64 - br lr + ret diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/api.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/api.h index 8f1010d60..36c00f706 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/api.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/api.h @@ -13,7 +13,7 @@ #define PQCLEAN_KYBER512_AARCH64_CRYPTO_PUBLICKEYBYTES 800 #define PQCLEAN_KYBER512_AARCH64_CRYPTO_CIPHERTEXTBYTES 768 #define PQCLEAN_KYBER512_AARCH64_CRYPTO_BYTES 32 -#define PQCLEAN_KYBER512_AARCH64_CRYPTO_ALGNAME "Kyber512" +#define PQCLEAN_KYBER512_AARCH64_CRYPTO_ALGNAME "Kyber512" int PQCLEAN_KYBER512_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/cbd.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/cbd.h index 47a06806e..2b3eb2afb 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/cbd.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/cbd.h @@ -1,5 +1,5 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER512_AARCH64_CBD_H +#define PQCLEAN_KYBER512_AARCH64_CBD_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "poly.h" -#include #define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1) void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/indcpa.c b/Modules/PQClean/crypto_kem/kyber512/aarch64/indcpa.c index bff6b3bf9..07b2c0719 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/indcpa.c +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/indcpa.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/indcpa.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/indcpa.h index f93487a37..25ae3a1ab 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/indcpa.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/indcpa.h @@ -1,5 +1,5 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER512_AARCH64_INDCPA_H +#define PQCLEAN_KYBER512_AARCH64_INDCPA_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "polyvec.h" -#include #define gen_matrix KYBER_NAMESPACE(gen_matrix) void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/kem.c b/Modules/PQClean/crypto_kem/kyber512/aarch64/kem.c index 670a4c599..572b5e93a 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/kem.c +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/kem.c @@ -8,12 +8,14 @@ #include #include #include + +#include "api.h" #include "params.h" +#include "kem.h" #include "indcpa.h" #include "verify.h" #include "symmetric.h" #include "randombytes.h" -#include "kem.h" /************************************************* * Name: crypto_kem_keypair_derand diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/kem.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/kem.h index 8b730b6c0..28ad70984 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/kem.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/kem.h @@ -1,5 +1,5 @@ -#ifndef KEM_H -#define KEM_H +#ifndef PQCLEAN_KYBER512_AARCH64_KEM_H +#define PQCLEAN_KYBER512_AARCH64_KEM_H /* * This file is licensed @@ -7,13 +7,8 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include - -#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES -#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES -#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES -#define CRYPTO_BYTES KYBER_SSBYTES +#include "params.h" #define CRYPTO_ALGNAME "Kyber512" @@ -33,3 +28,4 @@ int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif + diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/macros.inc b/Modules/PQClean/crypto_kem/kyber512/aarch64/macros.inc index 2add309e0..5504405c1 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/macros.inc +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,11 +28,114 @@ * SOFTWARE. */ -#ifndef MACROS_S -#define MACROS_S - #include "macros_common.inc" +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + .macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H .endm @@ -52,16 +158,68 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + .macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm -.macro qo_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm -.macro qo_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H .endm @@ -69,18 +227,176 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + .macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm -.macro do_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H .endm -.macro do_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm -#endif +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== + +.macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 +.endm + +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm +// === 32-bit end ==== diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/macros_common.inc b/Modules/PQClean/crypto_kem/kyber512/aarch64/macros_common.inc index c1ac021cd..07568491d 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/macros_common.inc +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,31 +35,51 @@ .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -75,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -85,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -102,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -138,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -152,34 +310,340 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] mul \t4\wX, \b4\wX, \z4\nX[\h4] sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] mul \t5\wX, \b5\wX, \z5\nX[\h5] sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] mul \t6\wX, \b6\wX, \z6\nX[\h6] sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] mul \t7\wX, \b7\wX, \z7\nX[\h7] + ldr \d0, [\srcd_ptr, \memd0] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] add \a2\wX, \a2\wX, \t2\wX sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] add \a3\wX, \a3\wX, \t3\wX sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -221,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -231,7 +769,53 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D + +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX @@ -248,15 +832,77 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + ldr \c0, [\srcc_ptr, \memc0] mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] mul \t1\wX, \b1\wX, \h1\wX sub \b3\wX, \a3\wX, \t3\wX + ldr \c2, [\srcc_ptr, \memc2] sqrdmulh \b0\wX, \b0\wX, \l0\wX add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] sqrdmulh \b1\wX, \b1\wX, \l1\wX add \a3\wX, \a3\wX, \t3\wX @@ -269,53 +915,53 @@ .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -394,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_poly.c b/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_poly.c index 1d9efe85d..1bb1fa7cd 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_poly.c +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_poly.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -129,14 +130,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER512_AARCH64_asm_add_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER512_AARCH64__asm_add_reduce(int16_t *, const int16_t *); void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER512_AARCH64_asm_add_reduce(c, a); + PQCLEAN_KYBER512_AARCH64__asm_add_reduce(c, a); } -extern void PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); +extern void PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) { - PQCLEAN_KYBER512_AARCH64_asm_add_add_reduce(c, a, b); + PQCLEAN_KYBER512_AARCH64__asm_add_add_reduce(c, a, b); } /************************************************* @@ -150,7 +151,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER512_AARCH64_asm_sub_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER512_AARCH64__asm_sub_reduce(int16_t *, const int16_t *); void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER512_AARCH64_asm_sub_reduce(c, a); + PQCLEAN_KYBER512_AARCH64__asm_sub_reduce(c, a); } diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_polyvec.c b/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_polyvec.c index c05f59d66..8787fcde6 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_polyvec.c +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_polyvec.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -83,7 +84,7 @@ void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]) { * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]) { +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { // c = c + a; @@ -91,4 +92,3 @@ void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYB neon_poly_add_reduce(c[i], a[i]); } } - diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c b/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c index a5a2e7833..9a59724e7 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/neon_symmetric-shake.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -35,7 +36,7 @@ #include #include #include "params.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "symmetric.h" /************************************************* @@ -88,8 +89,8 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce1, uint8_t nonce2) { unsigned int i; - uint8_t extkey1[KYBER_SYMBYTES + 1 + 15]; - uint8_t extkey2[KYBER_SYMBYTES + 1 + 15]; + uint8_t extkey1[KYBER_SYMBYTES + 1]; + uint8_t extkey2[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey1[i] = key[i]; @@ -99,5 +100,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, extkey1[i] = nonce1; extkey2[i] = nonce2; - shake256x2(out1, out2, outlen, extkey1, extkey2, KYBER_SYMBYTES + 1); + shake256x2(out1, out2, outlen, extkey1, extkey2, sizeof(extkey1)); } diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/ntt.c b/Modules/PQClean/crypto_kem/kyber512/aarch64/ntt.c index 8bca765e2..09583b73f 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/ntt.c +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/ntt.c @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,12 +28,35 @@ * SOFTWARE. */ -#include +#include #include "params.h" #include "ntt.h" -#include "reduce.h" #include "NTT_params.h" +const __attribute__ ((aligned (16)))int16_t asymmetric_const[8] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t constants[16] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, + invNQ1_R3modQ1_prime_half, + invNQ1_R3modQ1_doubleprime, + invNQ1_final_R3modQ1_prime_half, + invNQ1_final_R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { + 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 +}; + +const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { + 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { + 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 +}; + /************************************************* * Name: ntt * diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/ntt.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/ntt.h index bbf836109..795ebf3c2 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/ntt.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/ntt.h @@ -1,12 +1,15 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER512_AARCH64_NTT_H +#define PQCLEAN_KYBER512_AARCH64_NTT_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,12 +35,11 @@ #include "NTT_params.h" -extern const int16_t zetas[128]; - -#define ntt KYBER_NAMESPACE(ntt) -void ntt(int16_t r[256]); -#define invntt KYBER_NAMESPACE(invntt) -void invntt(int16_t r[256]); +#define asymmetric_const KYBER_NAMESPACE(asymmetric_const) +#define constants KYBER_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define pre_asymmetric_table_Q1_extended KYBER_NAMESPACE(pre_asymmetric_table_Q1_extended) +#define streamlined_inv_GS_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_inv_GS_negacyclic_table_Q1_jump_extended) extern void PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); @@ -49,38 +51,33 @@ extern void PQCLEAN_KYBER512_AARCH64__asm_point_mul_extended(int16_t *, const in extern void PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); extern void PQCLEAN_KYBER512_AARCH64__asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); -static const int16_t asymmetric_const[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime -}; +extern +const int16_t asymmetric_const[8]; +extern +const int16_t constants[16]; -#define NTT(in) { \ - PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; -#define iNTT(in) { \ - PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +extern +const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; -static const int16_t constants[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, - invNQ1_R3modQ1_prime_half, - invNQ1_R3modQ1_doubleprime, - invNQ1_final_R3modQ1_prime_half, - invNQ1_final_R3modQ1_doubleprime -}; +extern +const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; -static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, -7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0 -}; +#define NTT(in) do { \ + PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER512_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) -static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { - 167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175 -}; +#define iNTT(in) do { \ + PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER512_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) -static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0 -}; +#define ntt KYBER_NAMESPACE(ntt) +void ntt(int16_t r[256]); +#define invntt KYBER_NAMESPACE(invntt) +void invntt(int16_t r[256]); #endif diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/params.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/params.h index 91d415bb0..f4ddb3a52 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/params.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_KYBER512_AARCH64_PARAMS_H +#define PQCLEAN_KYBER512_AARCH64_PARAMS_H /* * This file is licensed @@ -7,11 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -//#define KYBER_90S /* Uncomment this if you want the 90S variant */ - #define KYBER_NAMESPACE(s) PQCLEAN_KYBER512_AARCH64_##s -#define KYBER_K 2 +/* Don't change parameters below this line */ #define KYBER_N 256 #define KYBER_Q 3329 @@ -21,6 +19,7 @@ #define KYBER_POLYBYTES 384 #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) +#define KYBER_K 2 #define KYBER_ETA1 3 #define KYBER_POLYCOMPRESSEDBYTES 128 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/poly.c b/Modules/PQClean/crypto_kem/kyber512/aarch64/poly.c index 7d5dbe66e..3cb9ecc48 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/poly.c +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/poly.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/kyber/blob/master/ref * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/poly.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/poly.h index 83c35067e..6ba67e59e 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/poly.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER512_AARCH64_POLY_H +#define PQCLEAN_KYBER512_AARCH64_POLY_H /* * This file is licensed @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial @@ -30,7 +30,7 @@ void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const int16_t a[KYBER_N]); #define poly_frommsg KYBER_NAMESPACE(poly_frommsg) void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]); #define poly_tomsg KYBER_NAMESPACE(poly_tomsg) -void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t a[KYBER_N]); +void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t r[KYBER_N]); // NEON diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/polyvec.c b/Modules/PQClean/crypto_kem/kyber512/aarch64/polyvec.c index d495809ec..8930c9563 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/polyvec.c +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/polyvec.c @@ -19,7 +19,7 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; uint16_t t[4]; @@ -79,7 +79,7 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV * (needs space for KYBER_POLYVECBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { poly_tobytes(r + i * KYBER_POLYBYTES, a[i]); diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/polyvec.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/polyvec.h index 827610d63..3ff4c2e2b 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/polyvec.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER512_AARCH64_POLYVEC_H +#define PQCLEAN_KYBER512_AARCH64_POLYVEC_H /* * This file was originally licensed @@ -7,8 +7,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -34,21 +35,21 @@ * SOFTWARE. */ +#include #include "params.h" #include "poly.h" -#include typedef struct { poly vec[KYBER_K]; } polyvec; #define polyvec_compress KYBER_NAMESPACE(polyvec_compress) -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress) void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); #define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes) -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes) void polyvec_frombytes(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECBYTES]); @@ -61,6 +62,6 @@ void neon_polyvec_ntt(int16_t r[KYBER_K][KYBER_N]); void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]); #define neon_polyvec_add_reduce KYBER_NAMESPACE(polyvec_add_reduce) -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]); +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]); #endif diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/reduce.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/reduce.h index 7d0f8e3bc..4266c7d74 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/reduce.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER512_AARCH64_REDUCE_H +#define PQCLEAN_KYBER512_AARCH64_REDUCE_H /* * This file is licensed @@ -7,8 +7,8 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" #define MONT (-1044) // 2^16 mod q #define QINV (-3327) // q^-1 mod 2^16 diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/rejsample.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/rejsample.h index ee9ae85c8..2442e01da 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/rejsample.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/rejsample.h @@ -1,5 +1,5 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H +#ifndef PQCLEAN_KYBER512_AARCH64_REJSAMPLE_H +#define PQCLEAN_KYBER512_AARCH64_REJSAMPLE_H /* * This file is licensed @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" #define neon_rej_uniform KYBER_NAMESPACE(_neon_rej_uniform) unsigned int neon_rej_uniform(int16_t *r, diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/symmetric.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/symmetric.h index cb9ea69e8..9019fd999 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/symmetric.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_KYBER512_AARCH64_SYMMETRIC_H +#define PQCLEAN_KYBER512_AARCH64_SYMMETRIC_H /* * This file is licensed @@ -8,9 +8,9 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include #include +#include "params.h" #include "fips202.h" @@ -37,7 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) // NEON Definition -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" typedef keccakx2_state neon_xof_state; diff --git a/Modules/PQClean/crypto_kem/kyber512/aarch64/verify.h b/Modules/PQClean/crypto_kem/kyber512/aarch64/verify.h index 3b9eca9f6..81b6525d0 100644 --- a/Modules/PQClean/crypto_kem/kyber512/aarch64/verify.h +++ b/Modules/PQClean/crypto_kem/kyber512/aarch64/verify.h @@ -1,5 +1,5 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER512_AARCH64_VERIFY_H +#define PQCLEAN_KYBER512_AARCH64_VERIFY_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include #include +#include "params.h" #define verify KYBER_NAMESPACE(verify) int verify(const uint8_t *a, const uint8_t *b, size_t len); diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/LICENSE b/Modules/PQClean/crypto_kem/kyber768/aarch64/LICENSE index 0e259d42c..093b0a7db 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/LICENSE +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/Makefile b/Modules/PQClean/crypto_kem/kyber768/aarch64/Makefile index e2d24a69d..aa78d6dab 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/Makefile +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/Makefile @@ -1,11 +1,15 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber768_aarch64.a -HEADERS=api.h cbd.h fips202x2.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h -OBJECTS=cbd.o fips202x2.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o feat.o +HEADERS=api.h cbd.h indcpa.h kem.h macros_common.inc macros.inc NTT_params.h ntt.h params.h poly.h polyvec.h reduce.h rejsample.h symmetric.h verify.h +OBJECTS=cbd.o indcpa.o kem.o neon_poly.o neon_polyvec.o neon_symmetric-shake.o ntt.o poly.o polyvec.o reduce.o rejsample.o symmetric-shake.o verify.o __asm_base_mul.o __asm_NTT.o __asm_iNTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) + all: $(LIB) %.o: %.c $(HEADERS) @@ -14,8 +18,11 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) clean: $(RM) $(OBJECTS) diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/NTT_params.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/NTT_params.h index d09348204..0d47ff96c 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/NTT_params.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/NTT_params.h @@ -1,8 +1,10 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_KYBER768_AARCH64_NTT_PARAMS_H +#define PQCLEAN_KYBER768_AARCH64_NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_NTT.S b/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_NTT.S index eb7661643..fe9f9e82c 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_NTT.S +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,165 +36,188 @@ PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top: _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top: - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 + push_simd + Q .req w8 + src .req x0 + table .req x1 + counter .req x11 ldrsh Q, [x2, #0] - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 + ldr q0, [table, # 0*16] + ldr q1, [table, # 1*16] + ldr q2, [table, # 2*16] + ldr q3, [table, # 3*16] mov v0.H[0], Q - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_top v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v0, v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 - qo_butterfly_mixed v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v0, v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 - qo_butterfly_mixed v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_bot v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr q13, [src, # 9*32] + ldr q15, [src, #11*32] + ldr q17, [src, #13*32] + ldr q19, [src, #15*32] + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32 + + qo_butterfly_mixll \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32, \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixsls \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #1*32, #3*32, #5*32, #7*32, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #0*32, #2*32, #4*32, #6*32 + + qo_butterfly_botsls \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #9*32, #11*32, #13*32, #15*32, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #8*32, #10*32, #12*32, #14*32 + + qo_butterfly_topl \ + v13, v15, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) + + qo_butterfly_mixl \ + v5, v7, v9, v11, v13, v15, v17, v19, v28, v29, v30, v31, \ + v12, v14, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_mix \ + v4, v6, v8, v10, v12, v14, v16, v18, v20, v21, v22, v23, \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v0, \ + v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v5, v7, v13, v15, v9, v11, v17, v19, v28, v29, v30, v31, \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7 + + qo_butterfly_mix \ + v4, v6, v12, v14, v8, v10, v16, v18, v20, v21, v22, v23, \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v0, \ + v0, 4, 5, v0, 4, 5, v0, 6, 7, v0, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v5, v9, v13, v17, v7, v11, v15, v19, v28, v29, v30, v31, \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7 + + qo_butterfly_mix \ + v4, v8, v12, v16, v6, v10, v14, v18, v20, v21, v22, v23, \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v0, \ + v1, 0, 1, v1, 2, 3, v1, 4, 5, v1, 6, 7, \ + v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7 + + qo_butterfly_mixss \ + v4, v6, v8, v10, v5, v7, v9, v11, v28, v29, v30, v31, \ + v13, v15, v17, v19, v20, v21, v22, v23, \ + v0, \ + v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, \ + src, \ + q5, q7, q9, q11, \ + #(16+1*32), #(16+3*32), #(16+5*32), #(16+7*32), \ + src, \ + q4, q6, q8, q10, \ + #(16+0*32), #(16+2*32), #(16+4*32), #(16+6*32) + + qo_butterfly_botss \ + v12, v14, v16, v18, v13, v15, v17, v19, v20, v21, v22, v23, \ + src, \ + q13, q15, q17, q19, \ + #(16+9*32), #(16+11*32), #(16+13*32), #(16+15*32), \ + src, \ + q12, q14, q16, q18, \ + #(16+8*32), #(16+10*32), #(16+12*32), #(16+14*32) .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 + .unreq src .unreq table .unreq counter - pop_all - - br lr + pop_simd + ret .align 2 .global PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot @@ -199,13 +225,13 @@ _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top: PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot: _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot: - push_all - Q .req w20 - BarrettM .req w21 + push_simd + Q .req w8 + BarrettM .req w9 src0 .req x0 src1 .req x1 - table .req x28 - counter .req x19 + table .req x10 + counter .req x11 ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] @@ -215,99 +241,127 @@ _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, # 1*16] + ldr q29, [src1, # 1*16] + ldr q30, [src0, # 3*16] + ldr q31, [src1, # 3*16] + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 mov counter, #3 _ntt_bot_loop: - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM - - do_butterfly_vec_top v25, v27, v29, v31, v18, v19, v0, v2, v3, v2, v3 - do_butterfly_vec_mixed v25, v27, v29, v31, v18, v19, v24, v26, v28, v30, v16, v17, v0, v2, v3, v2, v3, v2, v3, v2, v3 - do_butterfly_vec_mixed v24, v26, v28, v30, v16, v17, v25, v29, v27, v31, v18, v19, v0, v2, v3, v2, v3, v4, v5, v6, v7 - do_butterfly_vec_mixed v25, v29, v27, v31, v18, v19, v24, v28, v26, v30, v16, v17, v0, v4, v5, v6, v7, v4, v5, v6, v7 - do_butterfly_vec_mixed v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 - do_butterfly_vec_mixed v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 - do_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v0, v12, v13, v14, v15 - - oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v1, #11, v0 - - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + str q28, [src0, # 1*16] + ldr q28, [src0, #(64+1*16)] + str q29, [src1, # 1*16] + ldr q29, [src1, #(64+1*16)] + str q30, [src0, # 3*16] + ldr q30, [src0, #(64+3*16)] + str q31, [src1, # 3*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l3 v28, v29, v30, v31, v20, v21, v22, v23, table, q1, q2, q3, #1*16, #2*16, #3*16 + + do_butterfly_vec_top_2ltrn_4x4 \ + v29, v31, v18, v19, v0, v2, v3, v2, v3, \ + src0, src1, \ + q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16, \ + v24, v25, v26, v27, v20, v21, v22, v23 + + do_butterfly_vec_mixl \ + v25, v27, v29, v31, v18, v19, \ + v28, v30, v16, v17, \ + v0, \ + v2, v3, v2, v3, \ + table, \ + q4, q5, q6, q7, #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mixl \ + v24, v26, v28, v30, v16, v17, \ + v27, v31, v18, v19, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q8, q9, q10, q11, #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mixl \ + v25, v29, v27, v31, v18, v19, \ + v26, v30, v16, v17, \ + v0, \ + v4, v5, v6, v7, \ + table, \ + q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + add table, table, #256 + + do_butterfly_vec_mix v24, v28, v26, v30, v16, v17, v24, v26, v25, v27, v18, v19, v0, v4, v5, v6, v7, v8, v9, v10, v11 + + do_butterfly_vec_mix v24, v26, v25, v27, v18, v19, v28, v30, v29, v31, v16, v17, v0, v8, v9, v10, v11, v12, v13, v14, v15 + + do_butterfly_vec_bot_oo_barrett_trn_4x4 \ + v28, v30, v29, v31, v16, v17, \ + v24, v25, v26, v27, v20, v21, v22, v23, v28, v29, v30, v31, v16, v17, v18, v19, v0, #11, v0 + + trn_4x4_2s4 v28, v29, v30, v31, v16, v17, v18, v19, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + str q28, [src0, # 1*16] + str q29, [src1, # 1*16] + str q30, [src0, # 3*16] + str q31, [src1, # 3*16] + + add src0, src0, #64 + add src1, src1, #64 .unreq Q .unreq BarrettM @@ -315,12 +369,9 @@ _PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot: .unreq src1 .unreq table .unreq counter - pop_all - - br lr - - + pop_simd + ret diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_base_mul.S b/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_base_mul.S index cc4636a6a..f9fed3d36 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_base_mul.S +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_base_mul.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -44,44 +47,195 @@ _PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended: ldrsh Q, [x3] - dup v20.8H, Q - - // TODO: unroll this, currently we are using only 16 SIMD registers - mov counter, #4 - _point_mul_extended_loop: - - ld2 { v0.8H, v1.8H}, [src1], #32 - ld2 { v2.8H, v3.8H}, [src1], #32 - ld2 { v4.8H, v5.8H}, [src1], #32 - ld2 { v6.8H, v7.8H}, [src1], #32 + dup v28.8H, Q - ld2 { v8.8H, v9.8H}, [src2ex], #32 - ld2 {v10.8H, v11.8H}, [src2ex], #32 - ld2 {v12.8H, v13.8H}, [src2ex], #32 - ld2 {v14.8H, v15.8H}, [src2ex], #32 + ldr q0, [src1, #0*16] + ldr q1, [src1, #1*16] + ldr q2, [src1, #2*16] + ldr q3, [src1, #3*16] + ldr q4, [src1, #4*16] + ldr q5, [src1, #5*16] + ldr q6, [src1, #6*16] + ldr q7, [src1, #7*16] + + add src1, src1, #8*16 + + uzp2 v1.8H, v0.8H, v1.8H + uzp2 v3.8H, v2.8H, v3.8H + uzp2 v5.8H, v4.8H, v5.8H + uzp2 v7.8H, v6.8H, v7.8H + + ldr q8, [src2ex, #0*16] + ldr q10, [src2ex, #2*16] + ldr q12, [src2ex, #4*16] + ldr q14, [src2ex, #6*16] + ldr q9, [src2ex, #1*16] + ldr q11, [src2ex, #3*16] + ldr q13, [src2ex, #5*16] + ldr q15, [src2ex, #7*16] + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] + sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] + sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] + sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] + sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] + mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] + mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] + mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] + mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q0, [src1, #0*16] + sqrdmulh v16.8H, v17.8H, v8.8H + ldr q1, [src1, #1*16] + sqrdmulh v18.8H, v19.8H, v10.8H + ldr q2, [src1, #2*16] + sqrdmulh v20.8H, v21.8H, v12.8H + ldr q3, [src1, #3*16] + sqrdmulh v22.8H, v23.8H, v14.8H + + ldr q4, [src1, #4*16] + mul v17.8H, v17.8H, v9.8H + uzp2 v1.8H, v0.8H, v1.8H + ldr q5, [src1, #5*16] + mul v19.8H, v19.8H, v11.8H + uzp2 v3.8H, v2.8H, v3.8H + ldr q6, [src1, #6*16] + mul v21.8H, v21.8H, v13.8H + uzp2 v5.8H, v4.8H, v5.8H + ldr q7, [src1, #7*16] + mul v23.8H, v23.8H, v15.8H + uzp2 v7.8H, v6.8H, v7.8H + + add src1, src1, #8*16 + + ldr q8, [src2ex, #0*16] + mls v17.8H, v16.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v19.8H, v18.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v21.8H, v20.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v23.8H, v22.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q17, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q19, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q21, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q23, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + ldr q16, [src1, #0*16] sqrdmulh v0.8H, v1.8H, v8.8H + ldr q17, [src1, #1*16] sqrdmulh v2.8H, v3.8H, v10.8H + ldr q18, [src1, #2*16] sqrdmulh v4.8H, v5.8H, v12.8H + ldr q19, [src1, #3*16] sqrdmulh v6.8H, v7.8H, v14.8H + ldr q20, [src1, #4*16] mul v1.8H, v1.8H, v9.8H + uzp2 v17.8H, v16.8H, v17.8H + ldr q21, [src1, #5*16] mul v3.8H, v3.8H, v11.8H + uzp2 v19.8H, v18.8H, v19.8H + ldr q22, [src1, #6*16] mul v5.8H, v5.8H, v13.8H + uzp2 v21.8H, v20.8H, v21.8H + ldr q23, [src1, #7*16] mul v7.8H, v7.8H, v15.8H + uzp2 v23.8H, v22.8H, v23.8H - mls v1.8H, v0.8H, v20.8H - mls v3.8H, v2.8H, v20.8H - mls v5.8H, v4.8H, v20.8H - mls v7.8H, v6.8H, v20.8H + add src1, src1, #8*16 - st1 { v1.8H}, [des], #16 - st1 { v3.8H}, [des], #16 - st1 { v5.8H}, [des], #16 - st1 { v7.8H}, [des], #16 + ldr q8, [src2ex, #0*16] + mls v1.8H, v0.8H, v28.8H + ldr q10, [src2ex, #2*16] + mls v3.8H, v2.8H, v28.8H + ldr q12, [src2ex, #4*16] + mls v5.8H, v4.8H, v28.8H + ldr q14, [src2ex, #6*16] + mls v7.8H, v6.8H, v28.8H + + ldr q9, [src2ex, #1*16] + str q1, [des, #0*16] + ldr q11, [src2ex, #3*16] + str q3, [des, #1*16] + ldr q13, [src2ex, #5*16] + str q5, [des, #2*16] + ldr q15, [src2ex, #7*16] + str q7, [des, #3*16] + + add des, des, #4*16 + + add src2ex, src2ex, #8*16 + + sqrdmulh v16.8H, v17.8H, v8.8H + sqrdmulh v18.8H, v19.8H, v10.8H + sqrdmulh v20.8H, v21.8H, v12.8H + sqrdmulh v22.8H, v23.8H, v14.8H + + mul v17.8H, v17.8H, v9.8H + mul v19.8H, v19.8H, v11.8H + mul v21.8H, v21.8H, v13.8H + mul v23.8H, v23.8H, v15.8H + + mls v17.8H, v16.8H, v28.8H + mls v19.8H, v18.8H, v28.8H + mls v21.8H, v20.8H, v28.8H + mls v23.8H, v22.8H, v28.8H + + str q17, [des, #0*16] + str q19, [des, #1*16] + str q21, [des, #2*16] + str q23, [des, #3*16] + + add des, des, #4*16 - sub counter, counter, #1 - cbnz counter, _point_mul_extended_loop .unreq Q .unreq des @@ -90,7 +244,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended: .unreq counter pop_all - br lr + ret .align 2 @@ -100,8 +254,6 @@ PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: push_all - Q .req w28 - Qprime2 .req w27 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -117,8 +269,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] + ldr s4, [x3] add des, x4, #0 @@ -138,94 +289,294 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: add src2asy_3, src2asy_0, #256*3 #endif - dup v28.8H, Q - dup v29.8H, Qprime2 + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - // TODO:interleaving - mov counter, #16 - _asymmetric_mul_loop: + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + // TODO:interleaving + mov counter, #15 + _asymmetric_mul_loop: + + ldr q20, [src1_0, #0*16] + uzp1 v6.8H, v16.8H, v18.8H + ldr q21, [src1_0, #1*16] + uzp1 v7.8H, v17.8H, v19.8H + + ldr q22, [src2_0, #0*16] + mul v6.8H, v6.8H, v4.H[1] + ldr q23, [src2_0, #1*16] + mul v7.8H, v7.8H, v4.H[1] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + smlal v16.4S, v6.4H, v4.H[0] + uzp1 v0.8H, v20.8H, v21.8H + smlal2 v18.4S, v6.8H, v4.H[0] + uzp2 v1.8H, v20.8H, v21.8H + smlal v17.4S, v7.4H, v4.H[0] + uzp1 v2.8H, v22.8H, v23.8H + smlal2 v19.4S, v7.8H, v4.H[0] + uzp2 v3.8H, v22.8H, v23.8H + + ld1 {v28.8H}, [src2asy_0], #16 + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H +#endif + +#else - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - st2 {v24.8H, v25.8H}, [des], #32 +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_loop - .unreq Q - .unreq Qprime2 + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -242,7 +593,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul: .unreq counter pop_all - br lr + ret .align 2 @@ -252,10 +603,6 @@ PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: push_all - Q .req w28 - Qprime2 .req w27 - R3 .req w26 - R3p .req w25 des .req x11 src1_0 .req x0 src2_0 .req x1 @@ -271,11 +618,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: src2asy_3 .req x14 counter .req x19 - ldrsh Q, [x3, #0] - ldrsh Qprime2, [x3, #2] - - ldrsh R3, [x3, #8] - ldrsh R3p, [x3, #10] + ldr q4, [x3] add des, x4, #0 @@ -295,108 +638,312 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: add src2asy_3, src2asy_0, #256*3 #endif - dup v26.8H, R3 - dup v27.8H, R3p + ldr q20, [src1_0, #0*16] + ldr q21, [src1_0, #1*16] + ldr q22, [src2_0, #0*16] + ldr q23, [src2_0, #1*16] - dup v28.8H, Q - dup v29.8H, Qprime2 + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 - // TODO: interleaving - mov counter, #16 - _asymmetric_mul_montgomery_loop: + uzp1 v0.8H, v20.8H, v21.8H + uzp2 v1.8H, v20.8H, v21.8H + uzp1 v2.8H, v22.8H, v23.8H + uzp2 v3.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_0], #32 - ld2 { v2.8H, v3.8H}, [ src2_0], #32 - ld1 { v5.8H}, [src2asy_0], #16 + ld1 {v28.8H}, [src2asy_0], #16 smull v16.4S, v0.4H, v2.4H - smull2 v20.4S, v0.8H, v2.8H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] smull v17.4S, v0.4H, v3.4H - smull2 v21.4S, v0.8H, v3.8H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H - ld2 { v0.8H, v1.8H}, [ src1_1], #32 - ld2 { v2.8H, v3.8H}, [ src2_1], #32 - ld1 { v5.8H}, [src2asy_1], #16 + ld1 {v29.8H}, [src2asy_1], #16 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H +#if KYBER_K > 2 - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 -#if KYBER_K > 2 - ld2 { v0.8H, v1.8H}, [ src1_2], #32 - ld2 { v2.8H, v3.8H}, [ src2_2], #32 - ld1 { v5.8H}, [src2asy_2], #16 +#if KYBER_K > 3 - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif -#if KYBER_K > 3 - ld2 { v0.8H, v1.8H}, [ src1_3], #32 - ld2 { v2.8H, v3.8H}, [ src2_3], #32 - ld1 { v5.8H}, [src2asy_3], #16 +#else - smlal v16.4S, v0.4H, v2.4H - smlal2 v20.4S, v0.8H, v2.8H - smlal v17.4S, v0.4H, v3.4H - smlal2 v21.4S, v0.8H, v3.8H + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H - smlal v16.4S, v1.4H, v5.4H - smlal2 v20.4S, v1.8H, v5.8H - smlal v17.4S, v1.4H, v2.4H - smlal2 v21.4S, v1.8H, v2.8H #endif - uzp1 v0.8H, v16.8H, v20.8H - uzp1 v1.8H, v17.8H, v21.8H + mov counter, #15 + _asymmetric_mul_montgomery_loop: + + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + ldr q20, [src1_0, #0*16] + smlal v16.4S, v6.4H, v4.H[0] + ldr q21, [src1_0, #1*16] + smlal2 v18.4S, v6.8H, v4.H[0] + ldr q22, [src2_0, #0*16] + smlal v17.4S, v7.4H, v4.H[0] + ldr q23, [src2_0, #1*16] + smlal2 v19.4S, v7.8H, v4.H[0] + + add src1_0, src1_0, #32 + add src2_0, src2_0, #32 + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H - mul v0.8H, v0.8H, v29.8H - mul v1.8H, v1.8H, v29.8H + uzp1 v0.8H, v20.8H, v21.8H + sqrdmulh v16.8H, v6.8H, v4.H[4] + uzp2 v1.8H, v20.8H, v21.8H + sqrdmulh v17.8H, v7.8H, v4.H[4] - smlal v16.4S, v0.4H, v28.4H - smlal2 v20.4S, v0.8H, v28.8H - smlal v17.4S, v1.4H, v28.4H - smlal2 v21.4S, v1.8H, v28.8H + uzp1 v2.8H, v22.8H, v23.8H + mul v6.8H, v6.8H, v4.H[5] + uzp2 v3.8H, v22.8H, v23.8H + mul v7.8H, v7.8H, v4.H[5] - uzp2 v24.8H, v16.8H, v20.8H - uzp2 v25.8H, v17.8H, v21.8H + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] - sqrdmulh v16.8H, v24.8H, v26.8H - sqrdmulh v17.8H, v25.8H, v26.8H + st2 { v6.8H, v7.8H}, [des], #32 - mul v24.8H, v24.8H, v27.8H - mul v25.8H, v25.8H, v27.8H + ld1 {v28.8H}, [src2asy_0], #16 - mls v24.8H, v16.8H, v28.8H - mls v25.8H, v17.8H, v28.8H + smull v16.4S, v0.4H, v2.4H + ldr q20, [src1_1, #0*16] + smull2 v18.4S, v0.8H, v2.8H + ldr q21, [src1_1, #1*16] + smull v17.4S, v0.4H, v3.4H + ldr q22, [src2_1, #0*16] + smull2 v19.4S, v0.8H, v3.8H + ldr q23, [src2_1, #1*16] + + add src1_1, src1_1, #32 + add src2_1, src2_1, #32 + + smlal v16.4S, v1.4H, v28.4H + uzp1 v8.8H, v20.8H, v21.8H + smlal2 v18.4S, v1.8H, v28.8H + uzp2 v9.8H, v20.8H, v21.8H + smlal v17.4S, v1.4H, v2.4H + uzp1 v10.8H, v22.8H, v23.8H + smlal2 v19.4S, v1.8H, v2.8H + uzp2 v11.8H, v22.8H, v23.8H + + ld1 {v29.8H}, [src2asy_1], #16 + +#if KYBER_K > 2 + + smlal v16.4S, v8.4H, v10.4H + ldr q20, [src1_2, #0*16] + smlal2 v18.4S, v8.8H, v10.8H + ldr q21, [src1_2, #1*16] + smlal v17.4S, v8.4H, v11.4H + ldr q22, [src2_2, #0*16] + smlal2 v19.4S, v8.8H, v11.8H + ldr q23, [src2_2, #1*16] + + add src1_2, src1_2, #32 + add src2_2, src2_2, #32 + + smlal v16.4S, v9.4H, v29.4H + uzp1 v12.8H, v20.8H, v21.8H + smlal2 v18.4S, v9.8H, v29.8H + uzp2 v13.8H, v20.8H, v21.8H + smlal v17.4S, v9.4H, v10.4H + uzp1 v14.8H, v22.8H, v23.8H + smlal2 v19.4S, v9.8H, v10.8H + uzp2 v15.8H, v22.8H, v23.8H + + ld1 {v30.8H}, [src2asy_2], #16 + +#if KYBER_K > 3 + + smlal v16.4S, v12.4H, v14.4H + ldr q20, [src1_3, #0*16] + smlal2 v18.4S, v12.8H, v14.8H + ldr q21, [src1_3, #1*16] + smlal v17.4S, v12.4H, v15.4H + ldr q22, [src2_3, #0*16] + smlal2 v19.4S, v12.8H, v15.8H + ldr q23, [src2_3, #1*16] + + add src1_3, src1_3, #32 + add src2_3, src2_3, #32 + + smlal v16.4S, v13.4H, v30.4H + uzp1 v24.8H, v20.8H, v21.8H + smlal2 v18.4S, v13.8H, v30.8H + uzp2 v25.8H, v20.8H, v21.8H + smlal v17.4S, v13.4H, v14.4H + uzp1 v26.8H, v22.8H, v23.8H + smlal2 v19.4S, v13.8H, v14.8H + uzp2 v27.8H, v22.8H, v23.8H + + ld1 {v31.8H}, [src2asy_3], #16 + + smlal v16.4S, v24.4H, v26.4H + smlal2 v18.4S, v24.8H, v26.8H + smlal v17.4S, v24.4H, v27.4H + smlal2 v19.4S, v24.8H, v27.8H + + smlal v16.4S, v25.4H, v31.4H + smlal2 v18.4S, v25.8H, v31.8H + smlal v17.4S, v25.4H, v26.4H + smlal2 v19.4S, v25.8H, v26.8H + +#else + + smlal v16.4S, v12.4H, v14.4H + smlal2 v18.4S, v12.8H, v14.8H + smlal v17.4S, v12.4H, v15.4H + smlal2 v19.4S, v12.8H, v15.8H + + smlal v16.4S, v13.4H, v30.4H + smlal2 v18.4S, v13.8H, v30.8H + smlal v17.4S, v13.4H, v14.4H + smlal2 v19.4S, v13.8H, v14.8H + +#endif - st2 {v24.8H, v25.8H}, [des], #32 +#else + + smlal v16.4S, v8.4H, v10.4H + smlal2 v18.4S, v8.8H, v10.8H + smlal v17.4S, v8.4H, v11.4H + smlal2 v19.4S, v8.8H, v11.8H + + smlal v16.4S, v9.4H, v29.4H + smlal2 v18.4S, v9.8H, v29.8H + smlal v17.4S, v9.4H, v10.4H + smlal2 v19.4S, v9.8H, v10.8H + +#endif sub counter, counter, #1 cbnz counter, _asymmetric_mul_montgomery_loop - .unreq Q - .unreq Qprime2 - .unreq R3 - .unreq R3p + uzp1 v6.8H, v16.8H, v18.8H + uzp1 v7.8H, v17.8H, v19.8H + + mul v6.8H, v6.8H, v4.H[1] + mul v7.8H, v7.8H, v4.H[1] + + smlal v16.4S, v6.4H, v4.H[0] + smlal2 v18.4S, v6.8H, v4.H[0] + smlal v17.4S, v7.4H, v4.H[0] + smlal2 v19.4S, v7.8H, v4.H[0] + + uzp2 v6.8H, v16.8H, v18.8H + uzp2 v7.8H, v17.8H, v19.8H + + sqrdmulh v16.8H, v6.8H, v4.H[4] + sqrdmulh v17.8H, v7.8H, v4.H[4] + + mul v6.8H, v6.8H, v4.H[5] + mul v7.8H, v7.8H, v4.H[5] + + mls v6.8H, v16.8H, v4.H[0] + mls v7.8H, v17.8H, v4.H[0] + + st2 { v6.8H, v7.8H}, [des], #32 + .unreq des .unreq src1_0 .unreq src2_0 @@ -413,7 +960,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery: .unreq counter pop_all - br lr + ret diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_iNTT.S b/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_iNTT.S index 7ddb59251..f3e798232 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_iNTT.S +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -49,57 +52,116 @@ _PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot: add src0, x0, #256*0 add src1, x0, #256*1 - mov counter, #4 + mov v0.H[0], Q + mov v0.H[1], BarrettM + + ldr q28, [src0, #1*16] + ldr q29, [src1, #1*16] + ldr q30, [src0, #3*16] + ldr q31, [src1, #3*16] + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 + + do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 + + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 + + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 + + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 + + mov counter, #3 _intt_bot_loop: - ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0] - ld4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1] - - trn1 v24.4S, v16.4S, v20.4S - ld2 { v0.8H, v1.8H}, [table], #32 - trn2 v28.4S, v16.4S, v20.4S - ld2 { v2.8H, v3.8H}, [table], #32 - trn1 v25.4S, v17.4S, v21.4S - ld2 { v4.8H, v5.8H}, [table], #32 - trn2 v29.4S, v17.4S, v21.4S - ld2 { v6.8H, v7.8H}, [table], #32 - trn1 v26.4S, v18.4S, v22.4S - ld2 { v8.8H, v9.8H}, [table], #32 - trn2 v30.4S, v18.4S, v22.4S - ld2 {v10.8H, v11.8H}, [table], #32 - trn1 v27.4S, v19.4S, v23.4S - ld2 {v12.8H, v13.8H}, [table], #32 - trn2 v31.4S, v19.4S, v23.4S - ld2 {v14.8H, v15.8H}, [table], #32 - - dup v0.8H, Q - mov v1.H[0], BarrettM + str q24, [src0, #0*16] + ldr q28, [src0, #(64+1*16)] + str q25, [src1, #0*16] + ldr q29, [src1, #(64+1*16)] + str q26, [src0, #2*16] + ldr q30, [src0, #(64+3*16)] + str q27, [src1, #2*16] + ldr q31, [src1, #(64+3*16)] + + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_2l4 v28, v29, v30, v31, v20, v21, v22, v23, src0, src1, q24, q25, q26, q27, #0*16, #0*16, #2*16, #2*16 + + trn_4x4_2l4 v24, v25, v26, v27, v20, v21, v22, v23, table, table, q12, q13, q14, q15, #12*16, #13*16, #14*16, #15*16 do_butterfly_vec_bot v28, v30, v18, v19, v29, v31, v0, v12, v13, v14, v15 - do_butterfly_vec_mixed_rev v28, v30, v18, v19, v29, v31, v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, v8, v9, v10, v11 - do_butterfly_vec_mixed_rev v24, v26, v16, v17, v25, v27, v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, v6, v7, v6, v7 - do_butterfly_vec_mixed_rev v28, v29, v18, v19, v30, v31, v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, v4, v5, v4, v5 - do_butterfly_vec_mixed_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 - do_butterfly_vec_mixed_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 + + do_butterfly_vec_mix_rev_l4 \ + v18, v19, v29, v31, \ + v24, v26, v16, v17, v25, v27, v0, v12, v13, v14, v15, \ + table, \ + q8, q9, q10, q11, \ + #8*16, #9*16, #10*16, #11*16 + + do_butterfly_vec_mix_rev_l4 \ + v16, v17, v25, v27, \ + v28, v29, v18, v19, v30, v31, v0, v8, v9, v10, v11, \ + table, \ + q4, q5, q6, q7, \ + #4*16, #5*16, #6*16, #7*16 + + do_butterfly_vec_mix_rev_l3 \ + v18, v19, v30, v31, \ + v24, v25, v16, v17, v26, v27, v0, v6, v7, v6, v7, \ + table, \ + q1, q2, q3, \ + #1*16, #2*16, #3*16 + + do_butterfly_vec_mix_rev v24, v25, v16, v17, v26, v27, v24, v25, v18, v19, v28, v29, v0, v4, v5, v4, v5, v2, v3, v2, v3 + do_butterfly_vec_mix_rev v24, v25, v18, v19, v28, v29, v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3, v2, v3, v2, v3 do_butterfly_vec_top v26, v27, v16, v17, v30, v31, v0, v2, v3, v2, v3 - qo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v1, #11, v0 + oo_barrett v24, v25, v26, v27, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, #11, v0 + + add table, table, #256 - trn1 v16.4S, v24.4S, v28.4S - trn2 v20.4S, v24.4S, v28.4S - trn1 v17.4S, v25.4S, v29.4S - trn2 v21.4S, v25.4S, v29.4S - trn1 v18.4S, v26.4S, v30.4S - trn2 v22.4S, v26.4S, v30.4S - trn1 v19.4S, v27.4S, v31.4S - trn2 v23.4S, v27.4S, v31.4S + trn_4x4 v28, v29, v30, v31, v16, v17, v18, v19 - st4 {v16.4S, v17.4S, v18.4S, v19.4S}, [src0], #64 - st4 {v20.4S, v21.4S, v22.4S, v23.4S}, [src1], #64 + trn_4x4_2s4 v24, v25, v26, v27, v16, v17, v18, v19, src0, src1, q28, q29, q30, q31, #1*16, #1*16, #3*16, #3*16 sub counter, counter, #1 cbnz counter, _intt_bot_loop + str q24, [src0, #0*16] + str q25, [src1, #0*16] + str q26, [src0, #2*16] + str q27, [src1, #2*16] + + .unreq Q .unreq BarrettM .unreq src0 @@ -108,7 +170,7 @@ _PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot: .unreq counter pop_all - br lr + ret .align 2 .global PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top @@ -121,245 +183,131 @@ _PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top: BarrettM .req w21 invN .req w22 invN_f .req w23 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 + table .req x1 counter .req x19 - ldrsh Q, [x2, #0] + ldrsh Q, [x2, #0] ldrsh BarrettM, [x2, #8] - ldr invN, [x2, #10] - ldr invN_f, [x2, #14] - - mov table, x1 - - add src0, x0, #32*0 - add src1, x0, #32*1 - add src2, x0, #32*2 - add src3, x0, #32*3 - add src4, x0, #32*4 - add src5, x0, #32*5 - add src6, x0, #32*6 - add src7, x0, #32*7 - add src8, x0, #32*8 - add src9, x0, #32*9 - add src10, x0, #32*10 - add src11, x0, #32*11 - add src12, x0, #32*12 - add src13, x0, #32*13 - add src14, x0, #32*14 - add src15, x0, #32*15 - - ld1 { v0.8H, v1.8H, v2.8H, v3.8H}, [table], #64 - - mov v0.H[0], Q - - dup v24.8H, Q - dup v25.8H, BarrettM - - ld1 { v4.8H}, [ src0] - ld1 { v5.8H}, [ src1] - ld1 { v6.8H}, [ src2] - ld1 { v7.8H}, [ src3] - ld1 { v8.8H}, [ src4] - ld1 { v9.8H}, [ src5] - ld1 {v10.8H}, [ src6] - ld1 {v11.8H}, [ src7] - - ld1 {v12.8H}, [ src8] - ld1 {v13.8H}, [ src9] - ld1 {v14.8H}, [src10] - ld1 {v15.8H}, [src11] - ld1 {v16.8H}, [src12] - ld1 {v17.8H}, [src13] - ld1 {v18.8H}, [src14] - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - ld1 { v4.8H}, [ src0] - st1 { v5.8H}, [ src1], #16 - ld1 { v5.8H}, [ src1] - st1 { v6.8H}, [ src2], #16 - ld1 { v6.8H}, [ src2] - st1 { v7.8H}, [ src3], #16 - ld1 { v7.8H}, [ src3] - st1 { v8.8H}, [ src4], #16 - ld1 { v8.8H}, [ src4] - st1 { v9.8H}, [ src5], #16 - ld1 { v9.8H}, [ src5] - st1 {v10.8H}, [ src6], #16 - ld1 {v10.8H}, [ src6] - st1 {v11.8H}, [ src7], #16 - ld1 {v11.8H}, [ src7] - - st1 {v12.8H}, [ src8], #16 - ld1 {v12.8H}, [ src8] - st1 {v13.8H}, [ src9], #16 - ld1 {v13.8H}, [ src9] - st1 {v14.8H}, [src10], #16 - ld1 {v14.8H}, [src10] - st1 {v15.8H}, [src11], #16 - ld1 {v15.8H}, [src11] - st1 {v16.8H}, [src12], #16 - ld1 {v16.8H}, [src12] - st1 {v17.8H}, [src13], #16 - ld1 {v17.8H}, [src13] - st1 {v18.8H}, [src14], #16 - ld1 {v18.8H}, [src14] - st1 {v19.8H}, [src15], #16 - ld1 {v19.8H}, [src15] - - qo_butterfly_bot v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v12, v14, v16, v18, v28, v29, v30, v31, v13, v15, v17, v19, v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 - qo_butterfly_mixed_rev v4, v6, v8, v10, v20, v21, v22, v23, v5, v7, v9, v11, v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 - qo_butterfly_mixed_rev v12, v13, v16, v17, v28, v29, v30, v31, v14, v15, v18, v19, v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 - qo_butterfly_mixed_rev v4, v5, v8, v9, v20, v21, v22, v23, v6, v7, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 - qo_butterfly_mixed_rev v12, v13, v14, v15, v28, v29, v30, v31, v16, v17, v18, v19, v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - qo_butterfly_top v4, v5, v6, v7, v20, v21, v22, v23, v8, v9, v10, v11, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 - - qo_barrett_vec v4, v5, v12, v13, v20, v21, v22, v23, v25, #11, v24 - - mov v0.S[1], invN_f - - qo_butterfly_bot v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_mixed_rev v4, v5, v6, v7, v28, v29, v30, v31, v12, v13, v14, v15, v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - qo_butterfly_top v8, v9, v10, v11, v20, v21, v22, v23, v16, v17, v18, v19, v0, v0, 2, 3, v0, 2, 3, v0, 2, 3, v0, 2, 3 - - mov v0.S[1], invN - - sqrdmulh v28.8H, v4.8H, v0.H[2] - sqrdmulh v29.8H, v5.8H, v0.H[2] - sqrdmulh v30.8H, v6.8H, v0.H[2] - sqrdmulh v31.8H, v7.8H, v0.H[2] - sqrdmulh v20.8H, v8.8H, v0.H[2] - sqrdmulh v21.8H, v9.8H, v0.H[2] - sqrdmulh v22.8H, v10.8H, v0.H[2] - sqrdmulh v23.8H, v11.8H, v0.H[2] - - mul v4.8H, v4.8H, v0.H[3] - mul v5.8H, v5.8H, v0.H[3] - mul v6.8H, v6.8H, v0.H[3] - mul v7.8H, v7.8H, v0.H[3] - mul v8.8H, v8.8H, v0.H[3] - mul v9.8H, v9.8H, v0.H[3] - mul v10.8H, v10.8H, v0.H[3] - mul v11.8H, v11.8H, v0.H[3] - - mls v4.8H, v28.8H, v0.H[0] - mls v5.8H, v29.8H, v0.H[0] - mls v6.8H, v30.8H, v0.H[0] - mls v7.8H, v31.8H, v0.H[0] - mls v8.8H, v20.8H, v0.H[0] - mls v9.8H, v21.8H, v0.H[0] - mls v10.8H, v22.8H, v0.H[0] - mls v11.8H, v23.8H, v0.H[0] - - st1 { v4.8H}, [ src0], #16 - st1 { v5.8H}, [ src1], #16 - st1 { v6.8H}, [ src2], #16 - st1 { v7.8H}, [ src3], #16 - st1 { v8.8H}, [ src4], #16 - st1 { v9.8H}, [ src5], #16 - st1 {v10.8H}, [ src6], #16 - st1 {v11.8H}, [ src7], #16 - - st1 {v12.8H}, [ src8], #16 - st1 {v13.8H}, [ src9], #16 - st1 {v14.8H}, [src10], #16 - st1 {v15.8H}, [src11], #16 - st1 {v16.8H}, [src12], #16 - st1 {v17.8H}, [src13], #16 - st1 {v18.8H}, [src14], #16 - st1 {v19.8H}, [src15], #16 + ldr invN, [x2, #10] + ldr invN_f, [x2, #14] + + mov v4.S[0], invN + mov v4.S[1], invN_f + + ldr q0, [table, #0*16] + mov v0.H[0], Q + + ldr q1, [table, #1*16] + ldr q2, [table, #2*16] + ldr q3, [table, #3*16] + + ldr q16, [src, # 8*32] + ldr q17, [src, # 9*32] + ldr q18, [src, #10*32] + ldr q19, [src, #11*32] + ldr q20, [src, #12*32] + ldr q21, [src, #13*32] + ldr q22, [src, #14*32] + ldr q23, [src, #15*32] + + qo_butterfly_botll \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q12, q13, q14, q15, \ + #4*32, #5*32, #6*32, #7*32 + + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_topsl \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #8*32, #9*32, #10*32, #11*32, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + qo_montgomery_mul_insl \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #12*32, #13*32, #14*32, #15*32, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_butterfly_botsl_mul \ + v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, \ + src, \ + q8, q9, q10, q11, \ + #0*32, #1*32, #2*32, #3*32, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32), \ + v12, v13, v14, v15, v24, v25, v26, v27, \ + v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0 + + str q12, [src, # 4*32] + ldr q12, [src, #(16+ 4*32)] + str q13, [src, # 5*32] + ldr q13, [src, #(16+ 5*32)] + str q14, [src, # 6*32] + ldr q14, [src, #(16+ 6*32)] + str q15, [src, # 7*32] + ldr q15, [src, #(16+ 7*32)] + + qo_butterfly_mix_rev v16, v18, v20, v22, v28, v29, v30, v31, v17, v19, v21, v23, v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v0, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7, v3, 0, 1, v3, 2, 3, v3, 4, 5, v3, 6, 7 + qo_butterfly_mix_rev v8, v10, v12, v14, v24, v25, v26, v27, v9, v11, v13, v15, v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v0, v2, 0, 1, v2, 2, 3, v2, 4, 5, v2, 6, 7, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7 + qo_butterfly_mix_rev v16, v17, v20, v21, v28, v29, v30, v31, v18, v19, v22, v23, v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v0, v1, 4, 5, v1, 4, 5, v1, 6, 7, v1, 6, 7, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3 + qo_butterfly_mix_rev v8, v9, v12, v13, v24, v25, v26, v27, v10, v11, v14, v15, v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v0, v1, 0, 1, v1, 0, 1, v1, 2, 3, v1, 2, 3, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7 + qo_butterfly_mix_rev v16, v17, v18, v19, v28, v29, v30, v31, v20, v21, v22, v23, v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v0, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 6, 7, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5 + qo_butterfly_mix_rev v8, v9, v10, v11, v24, v25, v26, v27, v12, v13, v14, v15, v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v0, v0, 4, 5, v0, 4, 5, v0, 4, 5, v0, 4, 5, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + qo_butterfly_mix_rev v8, v9, v10, v11, v28, v29, v30, v31, v16, v17, v18, v19, v12, v13, v14, v15, v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3 + + qo_butterfly_tops \ + v24, v25, v26, v27, v20, v21, v22, v23, v0, v4, 2, 3, v4, 2, 3, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(16+8*32), #(16+9*32), #(16+10*32), #(16+11*32) + + + qo_montgomery_mul_ins \ + v8, v9, v10, v11, v28, v29, v30, v31, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q20, q21, q22, q23, \ + #(16+12*32), #(16+13*32), #(16+14*32), #(16+15*32) + + qo_montgomery_mul_ins \ + v12, v13, v14, v15, v24, v25, v26, v27, v0, v4, 1, 0, v4, 1, 0, v4, 1, 0, v4, 1, 0, \ + src, \ + q8, q9, q10, q11, \ + #(16+0*32), #(16+1*32), #(16+2*32), #(16+3*32) + + str q12, [src, #(16+ 4*32)] + str q13, [src, #(16+ 5*32)] + str q14, [src, #(16+ 6*32)] + str q15, [src, #(16+ 7*32)] .unreq Q .unreq BarrettM .unreq invN .unreq invN_f - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr - - - - + ret diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_poly.S b/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_poly.S index b934f8787..be524b33c 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_poly.S +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_KYBER768_AARCH64_asm_add_reduce -.global _PQCLEAN_KYBER768_AARCH64_asm_add_reduce -PQCLEAN_KYBER768_AARCH64_asm_add_reduce: -_PQCLEAN_KYBER768_AARCH64_asm_add_reduce: +.global PQCLEAN_KYBER768_AARCH64__asm_add_reduce +.global _PQCLEAN_KYBER768_AARCH64__asm_add_reduce +PQCLEAN_KYBER768_AARCH64__asm_add_reduce: +_PQCLEAN_KYBER768_AARCH64__asm_add_reduce: mov w4, #3329 mov w5, #25519 @@ -86,13 +89,13 @@ _PQCLEAN_KYBER768_AARCH64_asm_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER768_AARCH64_asm_sub_reduce -.global _PQCLEAN_KYBER768_AARCH64_asm_sub_reduce -PQCLEAN_KYBER768_AARCH64_asm_sub_reduce: -_PQCLEAN_KYBER768_AARCH64_asm_sub_reduce: +.global PQCLEAN_KYBER768_AARCH64__asm_sub_reduce +.global _PQCLEAN_KYBER768_AARCH64__asm_sub_reduce +PQCLEAN_KYBER768_AARCH64__asm_sub_reduce: +_PQCLEAN_KYBER768_AARCH64__asm_sub_reduce: mov w4, #3329 mov w5, #25519 @@ -147,13 +150,13 @@ _PQCLEAN_KYBER768_AARCH64_asm_sub_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v16.8H, v17.8H, v18.8H, v19.8H}, [x0], #64 - br lr + ret .align 2 -.global PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce -.global _PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce -PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce: -_PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce: +.global PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce +.global _PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce +PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce: +_PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce: mov w4, #3329 mov w5, #25519 @@ -232,7 +235,7 @@ _PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce: st1 { v4.8H, v5.8H, v6.8H, v7.8H}, [x0], #64 st1 {v20.8H, v21.8H, v22.8H, v23.8H}, [x0], #64 - br lr + ret diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/api.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/api.h index bb4877adf..eab10900d 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/api.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/api.h @@ -13,7 +13,7 @@ #define PQCLEAN_KYBER768_AARCH64_CRYPTO_PUBLICKEYBYTES 1184 #define PQCLEAN_KYBER768_AARCH64_CRYPTO_CIPHERTEXTBYTES 1088 #define PQCLEAN_KYBER768_AARCH64_CRYPTO_BYTES 32 -#define PQCLEAN_KYBER768_AARCH64_CRYPTO_ALGNAME "Kyber768" +#define PQCLEAN_KYBER768_AARCH64_CRYPTO_ALGNAME "Kyber768" int PQCLEAN_KYBER768_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/cbd.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/cbd.h index 47a06806e..5389023fc 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/cbd.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/cbd.h @@ -1,5 +1,5 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER768_AARCH64_CBD_H +#define PQCLEAN_KYBER768_AARCH64_CBD_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "poly.h" -#include #define poly_cbd_eta1 KYBER_NAMESPACE(poly_cbd_eta1) void poly_cbd_eta1(int16_t *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]); diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/feat.S b/Modules/PQClean/crypto_kem/kyber768/aarch64/feat.S deleted file mode 100644 index ce72974b0..000000000 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global PQCLEAN_KYBER768_AARCH64_f1600x2 -.global _PQCLEAN_KYBER768_AARCH64_f1600x2 -PQCLEAN_KYBER768_AARCH64_f1600x2: -_PQCLEAN_KYBER768_AARCH64_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/fips202x2.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/fips202x2.c deleted file mode 100644 index 77e1945d2..000000000 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/fips202x2.c +++ /dev/null @@ -1,684 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We choose - * CC0 1.0 Universal or the following MIT License for this file. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void PQCLEAN_KYBER768_AARCH64_f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_KYBER768_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/fips202x2.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/fips202x2.h deleted file mode 100644 index 14ceb7827..000000000 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/fips202x2.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include "params.h" -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -#define shake128x2_absorb KYBER_NAMESPACE(shake128x2_absorb) -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake128x2_squeezeblocks KYBER_NAMESPACE(shake128x2_squeezeblocks) -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake256x2_absorb KYBER_NAMESPACE(shake256x2_absorb) -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2_squeezeblocks KYBER_NAMESPACE(shake256x2_squeezeblocks) -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake128x2 KYBER_NAMESPACE(shake128x2) -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2 KYBER_NAMESPACE(shake256x2) -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#endif diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/indcpa.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/indcpa.c index 3e571c503..fef47388f 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/indcpa.c +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/indcpa.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -251,7 +252,6 @@ void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_S buflen = off + XOF_BLOCKBYTES; ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen); } - shake128_ctx_release(&c_state); } diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/indcpa.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/indcpa.h index f93487a37..ebcd26c2b 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/indcpa.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/indcpa.h @@ -1,5 +1,5 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER768_AARCH64_INDCPA_H +#define PQCLEAN_KYBER768_AARCH64_INDCPA_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ +#include #include "params.h" #include "polyvec.h" -#include #define gen_matrix KYBER_NAMESPACE(gen_matrix) void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed); diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/kem.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/kem.c index 670a4c599..572b5e93a 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/kem.c +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/kem.c @@ -8,12 +8,14 @@ #include #include #include + +#include "api.h" #include "params.h" +#include "kem.h" #include "indcpa.h" #include "verify.h" #include "symmetric.h" #include "randombytes.h" -#include "kem.h" /************************************************* * Name: crypto_kem_keypair_derand diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/kem.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/kem.h index 3bedcd28c..ce50e37c5 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/kem.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/kem.h @@ -1,5 +1,5 @@ -#ifndef KEM_H -#define KEM_H +#ifndef PQCLEAN_KYBER768_AARCH64_KEM_H +#define PQCLEAN_KYBER768_AARCH64_KEM_H /* * This file is licensed @@ -7,13 +7,8 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include - -#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES -#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES -#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES -#define CRYPTO_BYTES KYBER_SSBYTES +#include "params.h" #define CRYPTO_ALGNAME "Kyber768" @@ -33,3 +28,4 @@ int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); int crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif + diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/macros.inc b/Modules/PQClean/crypto_kem/kyber768/aarch64/macros.inc index 2add309e0..5504405c1 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/macros.inc +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,11 +28,114 @@ * SOFTWARE. */ -#ifndef MACROS_S -#define MACROS_S - #include "macros_common.inc" +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + .macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H .endm @@ -52,16 +158,68 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + .macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H .endm -.macro qo_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm -.macro qo_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H .endm @@ -69,18 +227,176 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + .macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H .endm -.macro do_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H .endm -.macro do_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 .endm +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm -#endif +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== + +.macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 +.endm + +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm +// === 32-bit end ==== diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/macros_common.inc b/Modules/PQClean/crypto_kem/kyber768/aarch64/macros_common.inc index c1ac021cd..07568491d 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/macros_common.inc +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,31 +35,51 @@ .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -75,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -85,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -102,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -138,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -152,34 +310,340 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] mul \t4\wX, \b4\wX, \z4\nX[\h4] sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] mul \t5\wX, \b5\wX, \z5\nX[\h5] sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] mul \t6\wX, \b6\wX, \z6\nX[\h6] sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] mul \t7\wX, \b7\wX, \z7\nX[\h7] + ldr \d0, [\srcd_ptr, \memd0] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] add \a2\wX, \a2\wX, \t2\wX sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] add \a3\wX, \a3\wX, \t3\wX sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -221,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -231,7 +769,53 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D + +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX @@ -248,15 +832,77 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + ldr \c0, [\srcc_ptr, \memc0] mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] mul \t1\wX, \b1\wX, \h1\wX sub \b3\wX, \a3\wX, \t3\wX + ldr \c2, [\srcc_ptr, \memc2] sqrdmulh \b0\wX, \b0\wX, \l0\wX add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] sqrdmulh \b1\wX, \b1\wX, \l1\wX add \a3\wX, \a3\wX, \t3\wX @@ -269,53 +915,53 @@ .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -394,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_poly.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_poly.c index 03e7a3290..bd8241443 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_poly.c +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_poly.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -129,14 +130,14 @@ void neon_poly_invntt_tomont(int16_t r[KYBER_N]) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER768_AARCH64_asm_add_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER768_AARCH64__asm_add_reduce(int16_t *, const int16_t *); void neon_poly_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER768_AARCH64_asm_add_reduce(c, a); + PQCLEAN_KYBER768_AARCH64__asm_add_reduce(c, a); } -extern void PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); +extern void PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce(int16_t *, const int16_t *, const int16_t *); void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], const int16_t b[KYBER_N]) { - PQCLEAN_KYBER768_AARCH64_asm_add_add_reduce(c, a, b); + PQCLEAN_KYBER768_AARCH64__asm_add_add_reduce(c, a, b); } /************************************************* @@ -150,7 +151,7 @@ void neon_poly_add_add_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N], cons * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_KYBER768_AARCH64_asm_sub_reduce(int16_t *, const int16_t *); +extern void PQCLEAN_KYBER768_AARCH64__asm_sub_reduce(int16_t *, const int16_t *); void neon_poly_sub_reduce(int16_t c[KYBER_N], const int16_t a[KYBER_N]) { - PQCLEAN_KYBER768_AARCH64_asm_sub_reduce(c, a); + PQCLEAN_KYBER768_AARCH64__asm_sub_reduce(c, a); } diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_polyvec.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_polyvec.c index c05f59d66..8787fcde6 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_polyvec.c +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_polyvec.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -83,7 +84,7 @@ void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]) { * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]) { +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { // c = c + a; @@ -91,4 +92,3 @@ void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYB neon_poly_add_reduce(c[i], a[i]); } } - diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c index a5a2e7833..9a59724e7 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/neon_symmetric-shake.c @@ -5,8 +5,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -35,7 +36,7 @@ #include #include #include "params.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "symmetric.h" /************************************************* @@ -88,8 +89,8 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce1, uint8_t nonce2) { unsigned int i; - uint8_t extkey1[KYBER_SYMBYTES + 1 + 15]; - uint8_t extkey2[KYBER_SYMBYTES + 1 + 15]; + uint8_t extkey1[KYBER_SYMBYTES + 1]; + uint8_t extkey2[KYBER_SYMBYTES + 1]; for (i = 0; i < KYBER_SYMBYTES; i++) { extkey1[i] = key[i]; @@ -99,5 +100,5 @@ void neon_kyber_shake256_prf(uint8_t *out1, uint8_t *out2, extkey1[i] = nonce1; extkey2[i] = nonce2; - shake256x2(out1, out2, outlen, extkey1, extkey2, KYBER_SYMBYTES + 1); + shake256x2(out1, out2, outlen, extkey1, extkey2, sizeof(extkey1)); } diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/ntt.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/ntt.c index 8bca765e2..09583b73f 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/ntt.c +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/ntt.c @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,12 +28,35 @@ * SOFTWARE. */ -#include +#include #include "params.h" #include "ntt.h" -#include "reduce.h" #include "NTT_params.h" +const __attribute__ ((aligned (16)))int16_t asymmetric_const[8] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t constants[16] = { + Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, + invNQ1_R3modQ1_prime_half, + invNQ1_R3modQ1_doubleprime, + invNQ1_final_R3modQ1_prime_half, + invNQ1_final_R3modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { + 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 2914, 14036, 14036, -8682, -8682, -12156, -12156, 296, 296, 1426, 1426, -882, -882, -1235, -1235, 2845, 2845, -9942, -9942, -748, -748, 7943, 7943, 289, 289, -1010, -1010, -76, -76, 807, 807, 3258, 3258, 14125, 14125, -15483, -15483, 4449, 4449, 331, 331, 1435, 1435, -1573, -1573, 452, 452, 167, 167, 15592, 15592, 16113, 16113, 3691, 3691, 17, 17, 1584, 1584, 1637, 1637, 375, 375, -5591, -5591, -10148, -10148, 7117, 7117, -7678, -7678, -568, -568, -1031, -1031, 723, 723, -780, -780, 5739, 5739, -12717, -12717, -10247, -10247, -12196, -12196, 583, 583, -1292, -1292, -1041, -1041, -1239, -1239, -6693, -6693, -1073, -1073, 10828, 10828, 16192, 16192, -680, -680, -109, -109, 1100, 1100, 1645, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 13180, 5266, 5266, 14529, 14529, -4400, -4400, 1339, 1339, 535, 535, 1476, 1476, -447, -447, 11782, 11782, 14155, 14155, -10355, -10355, 15099, 15099, 1197, 1197, 1438, 1438, -1052, -1052, 1534, 1534, -10089, -10089, -4538, -4538, -12540, -12540, -9125, -9125, -1025, -1025, -461, -461, -1274, -1274, -927, -927, 13869, 13869, 10463, 10463, 7441, 7441, -12107, -12107, 1409, 1409, 1063, 1063, 756, 756, -1230, -1230, -6565, -6565, 3140, 3140, -11546, -11546, 5522, 5522, -667, -667, 319, 319, -1173, -1173, 561, 561, -472, -472, -5473, -5473, -3091, -3091, -8495, -8495, -48, -48, -556, -556, -314, -314, -863, -863, 2293, 2293, 7451, 7451, -2746, -2746, -7235, -7235, 233, 233, 757, 757, -279, -279, -735, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -2786, -9213, -9213, 551, 551, -4429, -4429, -283, -283, -936, -936, 56, 56, -450, -450, 6398, 6398, -6713, -6713, -8032, -8032, 14578, 14578, 650, 650, -682, -682, -816, -816, 1481, 1481, -13308, -13308, -7008, -7008, 6221, 6221, 6378, 6378, -1352, -1352, -712, -712, 632, 632, 648, 648, -16005, -16005, -5168, -5168, -14588, -14588, 11251, 11251, -1626, -1626, -525, -525, -1482, -1482, 1143, 1143, 16251, 16251, 10749, 10749, 9371, 9371, -11605, -11605, 1651, 1651, 1092, 1092, 952, 952, -1179, -1179, -5315, -5315, 3967, 3967, 14381, 14381, -5453, -5453, -540, -540, 403, 403, 1461, 1461, -554, -554, -15159, -15159, 10099, 10099, -6319, -6319, 8721, 8721, -1540, -1540, 1026, 1026, -642, -642, 886, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -10719, -13338, -13338, 13121, 13121, 8081, 8081, -1089, -1089, -1355, -1355, 1333, 1333, 821, 821, -4567, -4567, -8416, -8416, 12993, 12993, 12078, 12078, -464, -464, -855, -855, 1320, 1320, 1227, 1227, 325, 325, -2156, -2156, -13918, -13918, 8957, 8957, 33, 33, -219, -219, -1414, -1414, 910, 910, 9243, 9243, -15818, -15818, 7215, 7215, -11999, -11999, 939, 939, -1607, -1607, 733, 733, -1219, -1219, -10050, -10050, 11930, 11930, -9764, -9764, -3878, -3878, -1021, -1021, 1212, 1212, -992, -992, -394, -394, -8780, -8780, -14322, -14322, 2638, 2638, 8711, 8711, -892, -892, -1455, -1455, 268, 268, 885, 885, -9262, -9262, 10129, 10129, 6309, 6309, -11566, -11566, -941, -941, 1029, 1029, 641, 641, -1175, -1175 +}; + +const __attribute__ ((aligned (16)))int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { + 167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 17, -17, -568, 568, 583, -583, -680, 680, 16113, -16113, 7117, -7117, -10247, 10247, 10828, -10828, 1637, -1637, 723, -723, -1041, 1041, 1100, -1100, 13869, -13869, -6565, 6565, -472, 472, 2293, -2293, 1409, -1409, -667, 667, -48, 48, 233, -233, 7441, -7441, -11546, 11546, -3091, 3091, -2746, 2746, 756, -756, -1173, 1173, -314, 314, -279, 279, -16005, 16005, 16251, -16251, -5315, 5315, -15159, 15159, -1626, 1626, 1651, -1651, -540, 540, -1540, 1540, -14588, 14588, 9371, -9371, 14381, -14381, -6319, 6319, -1482, 1482, 952, -952, 1461, -1461, -642, 642, 9243, -9243, -10050, 10050, -8780, 8780, -9262, 9262, 939, -939, -1021, 1021, -892, 892, -941, 941, 7215, -7215, -9764, 9764, 2638, -2638, 6309, -6309, 733, -733, -992, 992, 268, -268, 641, -641, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073, 1584, -1584, -1031, 1031, -1292, 1292, -109, 109, 3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 375, -375, -780, 780, -1239, 1239, 1645, -1645, 10463, -10463, 3140, -3140, -5473, 5473, 7451, -7451, 1063, -1063, 319, -319, -556, 556, 757, -757, -12107, 12107, 5522, -5522, -8495, 8495, -7235, 7235, -1230, 1230, 561, -561, -863, 863, -735, 735, -5168, 5168, 10749, -10749, 3967, -3967, 10099, -10099, -525, 525, 1092, -1092, 403, -403, 1026, -1026, 11251, -11251, -11605, 11605, -5453, 5453, 8721, -8721, 1143, -1143, -1179, 1179, -554, 554, 886, -886, -15818, 15818, 11930, -11930, -14322, 14322, 10129, -10129, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -11999, 11999, -3878, 3878, 8711, -8711, -11566, 11566, -1219, 1219, -394, 394, 885, -885, -1175, 1175 +}; + +const __attribute__ ((aligned (16)))int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1] = { + 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -8081, -13121, -13121, 13338, 13338, 10719, 10719, -821, -821, -1333, -1333, 1355, 1355, 1089, 1089, -8957, -8957, 13918, 13918, 2156, 2156, -325, -325, -910, -910, 1414, 1414, 219, 219, -33, -33, -12078, -12078, -12993, -12993, 8416, 8416, 4567, 4567, -1227, -1227, -1320, -1320, 855, 855, 464, 464, 11566, 11566, -6309, -6309, -10129, -10129, 9262, 9262, 1175, 1175, -641, -641, -1029, -1029, 941, 941, -8711, -8711, -2638, -2638, 14322, 14322, 8780, 8780, -885, -885, -268, -268, 1455, 1455, 892, 892, 3878, 3878, 9764, 9764, -11930, -11930, 10050, 10050, 394, 394, 992, 992, -1212, -1212, 1021, 1021, 11999, 11999, -7215, -7215, 15818, 15818, -9243, -9243, 1219, 1219, -733, -733, 1607, 1607, -939, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 4429, -551, -551, 9213, 9213, 2786, 2786, 450, 450, -56, -56, 936, 936, 283, 283, -6378, -6378, -6221, -6221, 7008, 7008, 13308, 13308, -648, -648, -632, -632, 712, 712, 1352, 1352, -14578, -14578, 8032, 8032, 6713, 6713, -6398, -6398, -1481, -1481, 816, 816, 682, 682, -650, -650, -8721, -8721, 6319, 6319, -10099, -10099, 15159, 15159, -886, -886, 642, 642, -1026, -1026, 1540, 1540, 5453, 5453, -14381, -14381, -3967, -3967, 5315, 5315, 554, 554, -1461, -1461, -403, -403, 540, 540, 11605, 11605, -9371, -9371, -10749, -10749, -16251, -16251, 1179, 1179, -952, -952, -1092, -1092, -1651, -1651, -11251, -11251, 14588, 14588, 5168, 5168, 16005, 16005, -1143, -1143, 1482, 1482, 525, 525, 1626, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 4400, -14529, -14529, -5266, -5266, -13180, -13180, 447, 447, -1476, -1476, -535, -535, -1339, -1339, 9125, 9125, 12540, 12540, 4538, 4538, 10089, 10089, 927, 927, 1274, 1274, 461, 461, 1025, 1025, -15099, -15099, 10355, 10355, -14155, -14155, -11782, -11782, -1534, -1534, 1052, 1052, -1438, -1438, -1197, -1197, 7235, 7235, 2746, 2746, -7451, -7451, -2293, -2293, 735, 735, 279, 279, -757, -757, -233, -233, 8495, 8495, 3091, 3091, 5473, 5473, 472, 472, 863, 863, 314, 314, 556, 556, 48, 48, -5522, -5522, 11546, 11546, -3140, -3140, 6565, 6565, -561, -561, 1173, 1173, -319, -319, 667, 667, 12107, 12107, -7441, -7441, -10463, -10463, -13869, -13869, 1230, 1230, -756, -756, -1063, -1063, -1409, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 12156, 8682, 8682, -14036, -14036, -2914, -2914, 1235, 1235, 882, 882, -1426, -1426, -296, -296, -4449, -4449, 15483, 15483, -14125, -14125, -3258, -3258, -452, -452, 1573, 1573, -1435, -1435, -331, -331, -7943, -7943, 748, 748, 9942, 9942, -2845, -2845, -807, -807, 76, 76, 1010, 1010, -289, -289, -16192, -16192, -10828, -10828, 1073, 1073, 6693, 6693, -1645, -1645, -1100, -1100, 109, 109, 680, 680, 12196, 12196, 10247, 10247, 12717, 12717, -5739, -5739, 1239, 1239, 1041, 1041, 1292, 1292, -583, -583, 7678, 7678, -7117, -7117, 10148, 10148, 5591, 5591, 780, 780, -723, -723, 1031, 1031, 568, 568, -3691, -3691, -16113, -16113, -15592, -15592, -167, -167, -375, -375, -1637, -1637, -1584, -1584, -17, -17 +}; + /************************************************* * Name: ntt * diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/ntt.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/ntt.h index 90e1c61df..59945023a 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/ntt.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/ntt.h @@ -1,12 +1,15 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER768_AARCH64_NTT_H +#define PQCLEAN_KYBER768_AARCH64_NTT_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,12 +35,11 @@ #include "NTT_params.h" -extern const int16_t zetas[128]; - -#define ntt KYBER_NAMESPACE(ntt) -void ntt(int16_t r[256]); -#define invntt KYBER_NAMESPACE(invntt) -void invntt(int16_t r[256]); +#define asymmetric_const KYBER_NAMESPACE(asymmetric_const) +#define constants KYBER_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define pre_asymmetric_table_Q1_extended KYBER_NAMESPACE(pre_asymmetric_table_Q1_extended) +#define streamlined_inv_GS_negacyclic_table_Q1_jump_extended KYBER_NAMESPACE(streamlined_inv_GS_negacyclic_table_Q1_jump_extended) extern void PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(int16_t *, const int16_t *, const int16_t *); extern void PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(int16_t *, const int16_t *, const int16_t *); @@ -49,38 +51,33 @@ extern void PQCLEAN_KYBER768_AARCH64__asm_point_mul_extended(int16_t *, const in extern void PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); extern void PQCLEAN_KYBER768_AARCH64__asm_asymmetric_mul_montgomery(const int16_t *, const int16_t *, const int16_t *, const int16_t *, int16_t *); -static const int16_t asymmetric_const[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, R3modQ1_prime_half, R3modQ1_doubleprime -}; +extern +const int16_t asymmetric_const[8]; +extern +const int16_t constants[16]; -#define NTT(in) { \ - PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int16_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; -#define iNTT(in) { \ - PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +extern +const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N]; -static const int16_t constants[16] = { - Q1, Q1prime2, RmodQ1, RmodQ1Q1prime, roundRdivQ1, - invNQ1_R3modQ1_prime_half, - invNQ1_R3modQ1_doubleprime, - invNQ1_final_R3modQ1_prime_half, - invNQ1_final_R3modQ1_doubleprime -}; +extern +const int16_t streamlined_inv_GS_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4) + NTT_N) << 1]; -static const int16_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, -15749, -1600, -7373, -749, -394, -40, -6762, -687, 6201, 630, -14095, -1432, 8347, 848, 10453, 1062, -13879, -1410, 1900, 193, 7845, 797, -5345, -543, -679, -69, 5601, 569, -15582, -1583, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2914, 296, 2914, 296, 14036, 1426, 14036, 1426, -8682, -882, -8682, -882, -12156, -1235, -12156, -1235, 2845, 289, 2845, 289, -9942, -1010, -9942, -1010, -748, -76, -748, -76, 7943, 807, 7943, 807, 3258, 331, 3258, 331, 14125, 1435, 14125, 1435, -15483, -1573, -15483, -1573, 4449, 452, 4449, 452, 167, 17, 167, 17, 15592, 1584, 15592, 1584, 16113, 1637, 16113, 1637, 3691, 375, 3691, 375, -5591, -568, -5591, -568, -10148, -1031, -10148, -1031, 7117, 723, 7117, 723, -7678, -780, -7678, -780, 5739, 583, 5739, 583, -12717, -1292, -12717, -1292, -10247, -1041, -10247, -1041, -12196, -1239, -12196, -1239, -6693, -680, -6693, -680, -1073, -109, -1073, -109, 10828, 1100, 10828, 1100, 16192, 1645, 16192, 1645, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13180, 1339, 13180, 1339, 5266, 535, 5266, 535, 14529, 1476, 14529, 1476, -4400, -447, -4400, -447, 11782, 1197, 11782, 1197, 14155, 1438, 14155, 1438, -10355, -1052, -10355, -1052, 15099, 1534, 15099, 1534, -10089, -1025, -10089, -1025, -4538, -461, -4538, -461, -12540, -1274, -12540, -1274, -9125, -927, -9125, -927, 13869, 1409, 13869, 1409, 10463, 1063, 10463, 1063, 7441, 756, 7441, 756, -12107, -1230, -12107, -1230, -6565, -667, -6565, -667, 3140, 319, 3140, 319, -11546, -1173, -11546, -1173, 5522, 561, 5522, 561, -472, -48, -472, -48, -5473, -556, -5473, -556, -3091, -314, -3091, -314, -8495, -863, -8495, -863, 2293, 233, 2293, 233, 7451, 757, 7451, 757, -2746, -279, -2746, -279, -7235, -735, -7235, -735, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2786, -283, -2786, -283, -9213, -936, -9213, -936, 551, 56, 551, 56, -4429, -450, -4429, -450, 6398, 650, 6398, 650, -6713, -682, -6713, -682, -8032, -816, -8032, -816, 14578, 1481, 14578, 1481, -13308, -1352, -13308, -1352, -7008, -712, -7008, -712, 6221, 632, 6221, 632, 6378, 648, 6378, 648, -16005, -1626, -16005, -1626, -5168, -525, -5168, -525, -14588, -1482, -14588, -1482, 11251, 1143, 11251, 1143, 16251, 1651, 16251, 1651, 10749, 1092, 10749, 1092, 9371, 952, 9371, 952, -11605, -1179, -11605, -1179, -5315, -540, -5315, -540, 3967, 403, 3967, 403, 14381, 1461, 14381, 1461, -5453, -554, -5453, -554, -15159, -1540, -15159, -1540, 10099, 1026, 10099, 1026, -6319, -642, -6319, -642, 8721, 886, 8721, 886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -10719, -1089, -10719, -1089, -13338, -1355, -13338, -1355, 13121, 1333, 13121, 1333, 8081, 821, 8081, 821, -4567, -464, -4567, -464, -8416, -855, -8416, -855, 12993, 1320, 12993, 1320, 12078, 1227, 12078, 1227, 325, 33, 325, 33, -2156, -219, -2156, -219, -13918, -1414, -13918, -1414, 8957, 910, 8957, 910, 9243, 939, 9243, 939, -15818, -1607, -15818, -1607, 7215, 733, 7215, 733, -11999, -1219, -11999, -1219, -10050, -1021, -10050, -1021, 11930, 1212, 11930, 1212, -9764, -992, -9764, -992, -3878, -394, -3878, -394, -8780, -892, -8780, -892, -14322, -1455, -14322, -1455, 2638, 268, 2638, 268, 8711, 885, 8711, 885, -9262, -941, -9262, -941, 10129, 1029, 10129, 1029, 6309, 641, 6309, 641, -11566, -1175, -11566, -1175, 0, 0 -}; +#define NTT(in) do { \ + PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER768_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) -static const int16_t pre_asymmetric_table_Q1_extended[ARRAY_N] = { - 167, 17, -167, -17, -5591, -568, 5591, 568, 5739, 583, -5739, -583, -6693, -680, 6693, 680, 16113, 1637, -16113, -1637, 7117, 723, -7117, -723, -10247, -1041, 10247, 1041, 10828, 1100, -10828, -1100, 13869, 1409, -13869, -1409, -6565, -667, 6565, 667, -472, -48, 472, 48, 2293, 233, -2293, -233, 7441, 756, -7441, -756, -11546, -1173, 11546, 1173, -3091, -314, 3091, 314, -2746, -279, 2746, 279, -16005, -1626, 16005, 1626, 16251, 1651, -16251, -1651, -5315, -540, 5315, 540, -15159, -1540, 15159, 1540, -14588, -1482, 14588, 1482, 9371, 952, -9371, -952, 14381, 1461, -14381, -1461, -6319, -642, 6319, 642, 9243, 939, -9243, -939, -10050, -1021, 10050, 1021, -8780, -892, 8780, 892, -9262, -941, 9262, 941, 7215, 733, -7215, -733, -9764, -992, 9764, 992, 2638, 268, -2638, -268, 6309, 641, -6309, -641, 15592, 1584, -15592, -1584, -10148, -1031, 10148, 1031, -12717, -1292, 12717, 1292, -1073, -109, 1073, 109, 3691, 375, -3691, -375, -7678, -780, 7678, 780, -12196, -1239, 12196, 1239, 16192, 1645, -16192, -1645, 10463, 1063, -10463, -1063, 3140, 319, -3140, -319, -5473, -556, 5473, 556, 7451, 757, -7451, -757, -12107, -1230, 12107, 1230, 5522, 561, -5522, -561, -8495, -863, 8495, 863, -7235, -735, 7235, 735, -5168, -525, 5168, 525, 10749, 1092, -10749, -1092, 3967, 403, -3967, -403, 10099, 1026, -10099, -1026, 11251, 1143, -11251, -1143, -11605, -1179, 11605, 1179, -5453, -554, 5453, 554, 8721, 886, -8721, -886, -15818, -1607, 15818, 1607, 11930, 1212, -11930, -1212, -14322, -1455, 14322, 1455, 10129, 1029, -10129, -1029, -11999, -1219, 11999, 1219, -3878, -394, 3878, 394, 8711, 885, -8711, -885, -11566, -1175, 11566, 1175 -}; +#define iNTT(in) do { \ + PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_bot(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_KYBER768_AARCH64__asm_intt_SIMD_top(in, streamlined_inv_GS_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) -static const int16_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4) + NTT_N) << 1] = { - 0, 0, 15749, 1600, 394, 40, 7373, 749, -8347, -848, 14095, 1432, -6201, -630, 6762, 687, 15582, 1583, -5601, -569, 679, 69, 5345, 543, -7845, -797, -1900, -193, 13879, 1410, -10453, -1062, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8081, -821, -8081, -821, -13121, -1333, -13121, -1333, 13338, 1355, 13338, 1355, 10719, 1089, 10719, 1089, -8957, -910, -8957, -910, 13918, 1414, 13918, 1414, 2156, 219, 2156, 219, -325, -33, -325, -33, -12078, -1227, -12078, -1227, -12993, -1320, -12993, -1320, 8416, 855, 8416, 855, 4567, 464, 4567, 464, 11566, 1175, 11566, 1175, -6309, -641, -6309, -641, -10129, -1029, -10129, -1029, 9262, 941, 9262, 941, -8711, -885, -8711, -885, -2638, -268, -2638, -268, 14322, 1455, 14322, 1455, 8780, 892, 8780, 892, 3878, 394, 3878, 394, 9764, 992, 9764, 992, -11930, -1212, -11930, -1212, 10050, 1021, 10050, 1021, 11999, 1219, 11999, 1219, -7215, -733, -7215, -733, 15818, 1607, 15818, 1607, -9243, -939, -9243, -939, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4429, 450, 4429, 450, -551, -56, -551, -56, 9213, 936, 9213, 936, 2786, 283, 2786, 283, -6378, -648, -6378, -648, -6221, -632, -6221, -632, 7008, 712, 7008, 712, 13308, 1352, 13308, 1352, -14578, -1481, -14578, -1481, 8032, 816, 8032, 816, 6713, 682, 6713, 682, -6398, -650, -6398, -650, -8721, -886, -8721, -886, 6319, 642, 6319, 642, -10099, -1026, -10099, -1026, 15159, 1540, 15159, 1540, 5453, 554, 5453, 554, -14381, -1461, -14381, -1461, -3967, -403, -3967, -403, 5315, 540, 5315, 540, 11605, 1179, 11605, 1179, -9371, -952, -9371, -952, -10749, -1092, -10749, -1092, -16251, -1651, -16251, -1651, -11251, -1143, -11251, -1143, 14588, 1482, 14588, 1482, 5168, 525, 5168, 525, 16005, 1626, 16005, 1626, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4400, 447, 4400, 447, -14529, -1476, -14529, -1476, -5266, -535, -5266, -535, -13180, -1339, -13180, -1339, 9125, 927, 9125, 927, 12540, 1274, 12540, 1274, 4538, 461, 4538, 461, 10089, 1025, 10089, 1025, -15099, -1534, -15099, -1534, 10355, 1052, 10355, 1052, -14155, -1438, -14155, -1438, -11782, -1197, -11782, -1197, 7235, 735, 7235, 735, 2746, 279, 2746, 279, -7451, -757, -7451, -757, -2293, -233, -2293, -233, 8495, 863, 8495, 863, 3091, 314, 3091, 314, 5473, 556, 5473, 556, 472, 48, 472, 48, -5522, -561, -5522, -561, 11546, 1173, 11546, 1173, -3140, -319, -3140, -319, 6565, 667, 6565, 667, 12107, 1230, 12107, 1230, -7441, -756, -7441, -756, -10463, -1063, -10463, -1063, -13869, -1409, -13869, -1409, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12156, 1235, 12156, 1235, 8682, 882, 8682, 882, -14036, -1426, -14036, -1426, -2914, -296, -2914, -296, -4449, -452, -4449, -452, 15483, 1573, 15483, 1573, -14125, -1435, -14125, -1435, -3258, -331, -3258, -331, -7943, -807, -7943, -807, 748, 76, 748, 76, 9942, 1010, 9942, 1010, -2845, -289, -2845, -289, -16192, -1645, -16192, -1645, -10828, -1100, -10828, -1100, 1073, 109, 1073, 109, 6693, 680, 6693, 680, 12196, 1239, 12196, 1239, 10247, 1041, 10247, 1041, 12717, 1292, 12717, 1292, -5739, -583, -5739, -583, 7678, 780, 7678, 780, -7117, -723, -7117, -723, 10148, 1031, 10148, 1031, 5591, 568, 5591, 568, -3691, -375, -3691, -375, -16113, -1637, -16113, -1637, -15592, -1584, -15592, -1584, -167, -17, -167, -17, 0, 0 -}; +#define ntt KYBER_NAMESPACE(ntt) +void ntt(int16_t r[256]); +#define invntt KYBER_NAMESPACE(invntt) +void invntt(int16_t r[256]); #endif diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/params.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/params.h index 66151ac2a..33b314e48 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/params.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/params.h @@ -9,6 +9,7 @@ #define KYBER_NAMESPACE(s) PQCLEAN_KYBER768_AARCH64_##s +/* Don't change parameters below this line */ #define KYBER_N 256 #define KYBER_Q 3329 diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/poly.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/poly.c index 7d5dbe66e..3cb9ecc48 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/poly.c +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/poly.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/kyber/blob/master/ref * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/poly.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/poly.h index 83c35067e..2af01f78a 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/poly.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER768_AARCH64_POLY_H +#define PQCLEAN_KYBER768_AARCH64_POLY_H /* * This file is licensed @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial @@ -30,7 +30,7 @@ void poly_tobytes(uint8_t r[KYBER_POLYBYTES], const int16_t a[KYBER_N]); #define poly_frommsg KYBER_NAMESPACE(poly_frommsg) void poly_frommsg(int16_t r[KYBER_N], const uint8_t msg[KYBER_INDCPA_MSGBYTES]); #define poly_tomsg KYBER_NAMESPACE(poly_tomsg) -void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t a[KYBER_N]); +void poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const int16_t r[KYBER_N]); // NEON diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/polyvec.c b/Modules/PQClean/crypto_kem/kyber768/aarch64/polyvec.c index d495809ec..8930c9563 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/polyvec.c +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/polyvec.c @@ -19,7 +19,7 @@ * (needs space for KYBER_POLYVECCOMPRESSEDBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i, j, k; uint16_t t[4]; @@ -79,7 +79,7 @@ void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYV * (needs space for KYBER_POLYVECBYTES) * - const polyvec *a: pointer to input vector of polynomials **************************************************/ -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]) { +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]) { unsigned int i; for (i = 0; i < KYBER_K; i++) { poly_tobytes(r + i * KYBER_POLYBYTES, a[i]); diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/polyvec.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/polyvec.h index 827610d63..97dcf23ce 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/polyvec.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER768_AARCH64_POLYVEC_H +#define PQCLEAN_KYBER768_AARCH64_POLYVEC_H /* * This file was originally licensed @@ -7,8 +7,9 @@ * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or * public domain at https://github.com/cothan/kyber/blob/master/neon * - * We choose + * We offer * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -34,21 +35,21 @@ * SOFTWARE. */ +#include #include "params.h" #include "poly.h" -#include typedef struct { poly vec[KYBER_K]; } polyvec; #define polyvec_compress KYBER_NAMESPACE(polyvec_compress) -void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_decompress KYBER_NAMESPACE(polyvec_decompress) void polyvec_decompress(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); #define polyvec_tobytes KYBER_NAMESPACE(polyvec_tobytes) -void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], int16_t a[KYBER_K][KYBER_N]); +void polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const int16_t a[KYBER_K][KYBER_N]); #define polyvec_frombytes KYBER_NAMESPACE(polyvec_frombytes) void polyvec_frombytes(int16_t r[KYBER_K][KYBER_N], const uint8_t a[KYBER_POLYVECBYTES]); @@ -61,6 +62,6 @@ void neon_polyvec_ntt(int16_t r[KYBER_K][KYBER_N]); void neon_polyvec_invntt_to_mont(int16_t r[KYBER_K][KYBER_N]); #define neon_polyvec_add_reduce KYBER_NAMESPACE(polyvec_add_reduce) -void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], int16_t a[KYBER_K][KYBER_N]); +void neon_polyvec_add_reduce(int16_t c[KYBER_K][KYBER_N], const int16_t a[KYBER_K][KYBER_N]); #endif diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/reduce.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/reduce.h index 7d0f8e3bc..e64226f6f 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/reduce.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER768_AARCH64_REDUCE_H +#define PQCLEAN_KYBER768_AARCH64_REDUCE_H /* * This file is licensed @@ -7,8 +7,8 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include +#include "params.h" #define MONT (-1044) // 2^16 mod q #define QINV (-3327) // q^-1 mod 2^16 diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/rejsample.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/rejsample.h index ee9ae85c8..876742091 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/rejsample.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/rejsample.h @@ -1,5 +1,5 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H +#ifndef PQCLEAN_KYBER768_AARCH64_REJSAMPLE_H +#define PQCLEAN_KYBER768_AARCH64_REJSAMPLE_H /* * This file is licensed @@ -8,8 +8,8 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include +#include "params.h" #define neon_rej_uniform KYBER_NAMESPACE(_neon_rej_uniform) unsigned int neon_rej_uniform(int16_t *r, diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/symmetric.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/symmetric.h index cb9ea69e8..c88aac1aa 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/symmetric.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/symmetric.h @@ -1,5 +1,5 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_KYBER768_AARCH64_SYMMETRIC_H +#define PQCLEAN_KYBER768_AARCH64_SYMMETRIC_H /* * This file is licensed @@ -8,9 +8,9 @@ * public domain at https://github.com/cothan/kyber/blob/master/neon */ -#include "params.h" #include #include +#include "params.h" #include "fips202.h" @@ -37,7 +37,7 @@ void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SY #define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) // NEON Definition -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" typedef keccakx2_state neon_xof_state; diff --git a/Modules/PQClean/crypto_kem/kyber768/aarch64/verify.h b/Modules/PQClean/crypto_kem/kyber768/aarch64/verify.h index 3b9eca9f6..a52767b3e 100644 --- a/Modules/PQClean/crypto_kem/kyber768/aarch64/verify.h +++ b/Modules/PQClean/crypto_kem/kyber768/aarch64/verify.h @@ -1,5 +1,5 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER768_AARCH64_VERIFY_H +#define PQCLEAN_KYBER768_AARCH64_VERIFY_H /* * This file is licensed @@ -7,9 +7,9 @@ * public domain at https://github.com/pq-crystals/kyber/tree/master/ref */ -#include "params.h" #include #include +#include "params.h" #define verify KYBER_NAMESPACE(verify) int verify(const uint8_t *a, const uint8_t *b, size_t len); diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/LICENSE b/Modules/PQClean/crypto_sign/dilithium2/aarch64/LICENSE index 0e259d42c..093b0a7db 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/LICENSE +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/Makefile b/Modules/PQClean/crypto_sign/dilithium2/aarch64/Makefile index 2e9510385..6f5f550dc 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/Makefile +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/Makefile @@ -1,11 +1,14 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium2_aarch64.a -HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h -OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o feat.o +HEADERS=api.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) all: $(LIB) @@ -15,9 +18,14 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) $(HEADERS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) clean: $(RM) $(OBJECTS) $(RM) $(LIB) + + diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/NTT_params.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/NTT_params.h index 582c16ed5..72ce624d7 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/NTT_params.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/NTT_params.h @@ -1,8 +1,10 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_NTT_PARAMS_H +#define PQCLEAN_DILITHIUM2_AARCH64_NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_NTT.S b/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_NTT.S index bf7c70cf0..ad121e6f0 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_NTT.S +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,261 +30,413 @@ #include "macros.inc" -.align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top -PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top: -_PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top: - - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 - - ldr Q, [x2] - - mov table, x1 - - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 - - add src11, src0, #704 - add src12, src0, #768 +#include "params.h" - add src13, src0, #832 - add src14, src0, #896 +.align 2 +.global PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top +PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top: +_PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top: - add src15, src0, #960 + push_simd + Q .req w8 + src .req x0 + counter .req x11 - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 + ldr Q, [x2] mov v20.S[0], Q - ld1 { v1.4S}, [ src1] - ld1 { v3.4S}, [ src3] - ld1 { v5.4S}, [ src5] - ld1 { v7.4S}, [ src7] - ld1 { v9.4S}, [ src9] - ld1 {v11.4S}, [src11] - ld1 {v13.4S}, [src13] - ld1 {v15.4S}, [src15] - - ld1 { v0.4S}, [ src0] - ld1 { v2.4S}, [ src2] - ld1 { v4.4S}, [ src4] - ld1 { v6.4S}, [ src6] - ld1 { v8.4S}, [ src8] - ld1 {v10.4S}, [src10] - ld1 {v12.4S}, [src12] - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + ldr q9, [src, #9*64] + ldr q11, [src, #11*64] + ldr q13, [src, #13*64] + ldr q15, [src, #15*64] + + qq_butterfly_topl \ + v9, v11, v13, v15, v16, v17, v18, v19, v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64 + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) mov counter, #3 _ntt_top_loop: - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mixssl \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + v9, v11, v13, v15, v16, v17, v18, v19, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64, \ + src, \ + q1, q3, q5, q7, \ + #(16+1*64), #(16+3*64), #(16+5*64), #(16+7*64) + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #(16+8*64), #(16+10*64), #(16+12*64), #(16+14*64), \ + src, \ + q0, q2, q4, q6, \ + #(16+0*64), #(16+2*64), #(16+4*64), #(16+6*64) + + add src, src, #16 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) sub counter, counter, #1 cbnz counter, _ntt_top_loop - st1 { v1.4S}, [ src1], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v7.4S}, [ src7], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v11.4S}, [src11], #16 - st1 {v13.4S}, [src13], #16 - st1 {v15.4S}, [src15], #16 - - st1 { v0.4S}, [ src0], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v8.4S}, [ src8], #16 - st1 {v10.4S}, [src10], #16 - st1 {v12.4S}, [src12], #16 - st1 {v14.4S}, [src14], #16 + qq_butterfly_botss \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter - pop_all + pop_simd - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot -PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot: -_PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot: - - push_all - Q .req w20 - src0 .req x0 - des0 .req x1 - src1 .req x2 - des1 .req x3 - table0 .req x28 - table1 .req x27 - counter .req x19 +.global PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot +PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot: +_PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot: + + push_simd + Q .req w8 + src .req x0 + table0 .req x9 + table1 .req x10 + counter .req x11 ldr Q, [x2] add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + ldr q0, [src, #0*16] + ldr q1, [src, #1*16] + ldr q2, [src, #2*16] + ldr q3, [src, #3*16] + + ldr q4, [table0, #0*16] + ldr q5, [table0, #1*16] + ldr q20, [table1, #0*16] + ldr q21, [table1, #1*16] + + dq_butterfly_topl4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 + + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + - add des0, src0, #0 - add des1, src0, #512 + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - mov counter, #8 + str q0, [src, #0*16] + str q2, [src, #2*16] + + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 + + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + mov counter, #3 _ntt_bot_loop: - ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld1 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + trn_4x4_l4 v12, v13, v14, v15, v8, v9, v10, v11, src, q0, q1, q2, q3, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + str q12, [src, #0*16] + str q13, [src, #1*16] - mov v4.S[0], Q + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - dq_butterfly_top v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 - dq_butterfly_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + str q14, [src, #2*16] + str q15, [src, #3*16] + + + add src, src, #64 + + trn_4x4_l4 v28, v29, v30, v31, v24, v25, v26, v27, src, q16, q17, q18, q19, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q28, q29, q30, q31, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + add table0, table0, #128 + add table1, table1, #128 + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 - dq_butterfly_vec_top v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7 - dq_butterfly_vec_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 - dq_butterfly_vec_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) + + str q0, [src, #0*16] + str q2, [src, #2*16] + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 - st4 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st4 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - .unreq Q - .unreq src0 - .unreq des0 - .unreq src1 - .unreq des1 - .unreq table0 - .unreq table1 - .unreq counter - pop_all + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - br lr + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + trn_4x4_s4 v28, v29, v30, v31, v16, v17, v18, v19, src, q12, q13, q14, q15, #0*16, #1*16, #2*16, #3*16 + str q28, [src, #(512+0*16)] + str q29, [src, #(512+1*16)] + str q30, [src, #(512+2*16)] + str q31, [src, #(512+3*16)] + add src, src, #64 + .unreq Q + .unreq src + .unreq table0 + .unreq table1 + .unreq counter + pop_simd + ret diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_iNTT.S b/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_iNTT.S index cb20745cb..b37abaebf 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_iNTT.S +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top -PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: -_PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top +PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top: +_PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top: push_all Q .req w20 @@ -41,23 +44,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: invNR2dp .req w25 invNWR2ph .req w26 invNWR2dp .req w27 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 counter .req x19 ldr Q, [x2, #0] @@ -69,77 +56,63 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: ldr invNWR2ph, [x2, #24] ldr invNWR2dp, [x2, #28] - mov table, x1 + ldr q20, [x1, #0*16] + ldr q21, [x1, #1*16] + ldr q22, [x1, #2*16] + ldr q23, [x1, #3*16] + ldr q24, [x1, #4*16] + ldr q25, [x1, #5*16] + ldr q26, [x1, #6*16] + ldr q27, [x1, #7*16] - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 + mov v20.S[0], Q - add src11, src0, #704 - add src12, src0, #768 + ldr q0, [src, # 0*64] + ldr q1, [src, # 1*64] - add src13, src0, #832 - add src14, src0, #896 + ldr q2, [src, # 2*64] + ldr q3, [src, # 3*64] - add src15, src0, #960 + ldr q4, [src, # 4*64] + ldr q5, [src, # 5*64] - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ldr q6, [src, # 6*64] + ldr q7, [src, # 7*64] - mov v20.S[0], Q + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 - ld1 { v0.4S}, [ src0] - ld1 { v1.4S}, [ src1] - ld1 { v2.4S}, [ src2] - ld1 { v3.4S}, [ src3] - ld1 { v4.4S}, [ src4] - ld1 { v5.4S}, [ src5] - ld1 { v6.4S}, [ src6] - ld1 { v7.4S}, [ src7] - - ld1 { v8.4S}, [ src8] - ld1 { v9.4S}, [ src9] - ld1 {v10.4S}, [src10] - ld1 {v11.4S}, [src11] - ld1 {v12.4S}, [src12] - ld1 {v13.4S}, [src13] - ld1 {v14.4S}, [src14] - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + mov counter, #3 + _intt_top_loop: + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -153,77 +126,99 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: sub v17.4S, v17.4S, v19.4S mla v0.4S, v16.4S, v29.4S - mla v1.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v2.4S + mla v1.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v3.4S + + str q0, [src, #0*64] cmge v16.4S, v2.4S, v30.4S + ldr q0, [src, #(16 + 0*64)] + str q1, [src, #1*64] cmge v17.4S, v3.4S, v30.4S + ldr q1, [src, #(16 + 1*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v2.4S, v16.4S, v29.4S - mla v3.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v4.4S + mla v3.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v5.4S + + str q2, [src, #2*64] cmge v16.4S, v4.4S, v30.4S + ldr q2, [src, #(16 + 2*64)] + str q3, [src, #3*64] cmge v17.4S, v5.4S, v30.4S + ldr q3, [src, #(16 + 3*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v4.4S, v16.4S, v29.4S - mla v5.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v6.4S + mla v5.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v7.4S + + str q4, [src, #4*64] cmge v16.4S, v6.4S, v30.4S + ldr q4, [src, #(16 + 4*64)] + str q5, [src, #5*64] cmge v17.4S, v7.4S, v30.4S + ldr q5, [src, #(16 + 5*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v6.4S, v16.4S, v29.4S - mla v7.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v8.4S + mla v7.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v9.4S + + str q6, [src, #6*64] cmge v16.4S, v8.4S, v30.4S + ldr q6, [src, #(16 + 6*64)] + str q7, [src, #7*64] cmge v17.4S, v9.4S, v30.4S + ldr q7, [src, #(16 + 7*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v8.4S, v16.4S, v29.4S - mla v9.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v10.4S + mla v9.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v11.4S + + str q8, [src, #8*64] cmge v16.4S, v10.4S, v30.4S + str q9, [src, #9*64] cmge v17.4S, v11.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v10.4S, v16.4S, v29.4S - mla v11.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v12.4S + mla v11.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v13.4S + + str q10, [src, #10*64] cmge v16.4S, v12.4S, v30.4S + str q11, [src, #11*64] cmge v17.4S, v13.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v12.4S, v16.4S, v29.4S - mla v13.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v14.4S + mla v13.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v15.4S + + str q12, [src, #12*64] cmge v16.4S, v14.4S, v30.4S + str q13, [src, #13*64] cmge v17.4S, v15.4S, v30.4S sub v16.4S, v16.4S, v18.4S @@ -232,66 +227,45 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - mov counter, #3 - _intt_top_loop: - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 + + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 + + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -307,6 +281,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v0.4S, v16.4S, v29.4S mla v1.4S, v17.4S, v29.4S + str q0, [src, #0*64] + str q1, [src, #1*64] + cmge v18.4S, v31.4S, v2.4S cmge v19.4S, v31.4S, v3.4S cmge v16.4S, v2.4S, v30.4S @@ -318,6 +295,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v2.4S, v16.4S, v29.4S mla v3.4S, v17.4S, v29.4S + str q2, [src, #2*64] + str q3, [src, #3*64] + cmge v18.4S, v31.4S, v4.4S cmge v19.4S, v31.4S, v5.4S cmge v16.4S, v4.4S, v30.4S @@ -329,6 +309,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v4.4S, v16.4S, v29.4S mla v5.4S, v17.4S, v29.4S + str q4, [src, #4*64] + str q5, [src, #5*64] + cmge v18.4S, v31.4S, v6.4S cmge v19.4S, v31.4S, v7.4S cmge v16.4S, v6.4S, v30.4S @@ -340,6 +323,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v6.4S, v16.4S, v29.4S mla v7.4S, v17.4S, v29.4S + str q6, [src, #6*64] + str q7, [src, #7*64] + cmge v18.4S, v31.4S, v8.4S cmge v19.4S, v31.4S, v9.4S cmge v16.4S, v8.4S, v30.4S @@ -351,6 +337,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v8.4S, v16.4S, v29.4S mla v9.4S, v17.4S, v29.4S + str q8, [src, #8*64] + str q9, [src, #9*64] + cmge v18.4S, v31.4S, v10.4S cmge v19.4S, v31.4S, v11.4S cmge v16.4S, v10.4S, v30.4S @@ -362,6 +351,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v10.4S, v16.4S, v29.4S mla v11.4S, v17.4S, v29.4S + str q10, [src, #10*64] + str q11, [src, #11*64] + cmge v18.4S, v31.4S, v12.4S cmge v19.4S, v31.4S, v13.4S cmge v16.4S, v12.4S, v30.4S @@ -373,6 +365,9 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v12.4S, v16.4S, v29.4S mla v13.4S, v17.4S, v29.4S + str q12, [src, #12*64] + str q13, [src, #13*64] + cmge v18.4S, v31.4S, v14.4S cmge v19.4S, v31.4S, v15.4S cmge v16.4S, v14.4S, v30.4S @@ -384,26 +379,11 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - sub counter, counter, #1 - cbnz counter, _intt_top_loop + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 - st1 { v0.4S}, [ src0], #16 - st1 { v1.4S}, [ src1], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v7.4S}, [ src7], #16 - - st1 { v8.4S}, [ src8], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v10.4S}, [src10], #16 - st1 {v11.4S}, [src11], #16 - st1 {v12.4S}, [src12], #16 - st1 {v13.4S}, [src13], #16 - st1 {v14.4S}, [src14], #16 - st1 {v15.4S}, [src15], #16 .unreq Q .unreq Qhalf @@ -412,41 +392,23 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top: .unreq invNR2dp .unreq invNWR2ph .unreq invNWR2dp - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot -PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot: -_PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot +PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot: +_PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot: push_all Q .req w20 RphRdp .req x21 src0 .req x0 - des0 .req x1 src1 .req x2 - des1 .req x3 table0 .req x28 table1 .req x27 counter .req x19 @@ -457,72 +419,175 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot: add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + add src1, src0, #512 - add des0, src0, #0 - add des1, src0, #512 + ldr q8, [table0, #4*16] + ldr q9, [table0, #5*16] + ldr q10, [table0, #6*16] + ldr q11, [table0, #7*16] - mov counter, #8 - _intt_bot_loop: + ldr q24, [table1, #4*16] + ldr q25, [table1, #5*16] + ldr q26, [table1, #6*16] + ldr q27, [table1, #7*16] + + ldr q0, [src0, # 0*16] + ldr q1, [src0, # 1*16] + + ldr q16, [src1, # 0*16] + ldr q17, [src1, # 1*16] - ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld4 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + ldr q2, [src0, # 2*16] + ldr q3, [src0, # 3*16] - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + ldr q18, [src1, # 2*16] + ldr q19, [src1, # 3*16] + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 mov v4.S[0], Q mov v20.D[0], RphRdp dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 - dq_butterfly_vec_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 - dq_butterfly_vec_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 - dq_butterfly_vec_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_top v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + mov counter, #7 + _intt_bot_loop: + + dq_butterfly_vec_top_ltrn_4x4 \ + v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + table0, \ + q8, q9, q10, q11, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16), \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q24, q25, q26, q27, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16) dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 - dq_butterfly_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + str q2, [src0, # 2*16] srshr v14.4S, v0.4S, #23 + ldr q2, [src0, #(64+ 2*16)] + str q3, [src0, # 3*16] srshr v15.4S, v1.4S, #23 + ldr q3, [src0, #(64+ 3*16)] + str q18, [src1, # 2*16] srshr v30.4S, v16.4S, #23 + ldr q18, [src1, #(64+ 2*16)] + str q19, [src1, # 3*16] srshr v31.4S, v17.4S, #23 + ldr q19, [src1, #(64+ 3*16)] mls v0.4S, v14.4S, v4.S[0] + str q0, [src0, # 0*16] + ldr q0, [src0, #(64+ 0*16)] mls v1.4S, v15.4S, v4.S[0] + str q1, [src0, # 1*16] + ldr q1, [src0, #(64+ 1*16)] mls v16.4S, v30.4S, v4.S[0] + str q16, [src1, # 0*16] + ldr q16, [src1, #(64+ 0*16)] mls v17.4S, v31.4S, v4.S[0] + str q17, [src1, # 1*16] + ldr q17, [src1, #(64+ 1*16)] + + add table0, table0, #128 + add table1, table1, #128 - st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st1 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 + + mov v4.S[0], Q + mov v20.D[0], RphRdp + + dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 sub counter, counter, #1 cbnz counter, _intt_bot_loop + dq_butterfly_vec_top_trn_4x4 \ + v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + + str q2, [src0, # 2*16] + str q3, [src0, # 3*16] + str q18, [src1, # 2*16] + str q19, [src1, # 3*16] + + srshr v14.4S, v0.4S, #23 + srshr v15.4S, v1.4S, #23 + srshr v30.4S, v16.4S, #23 + srshr v31.4S, v17.4S, #23 + + mls v0.4S, v14.4S, v4.S[0] + mls v1.4S, v15.4S, v4.S[0] + mls v16.4S, v30.4S, v4.S[0] + mls v17.4S, v31.4S, v4.S[0] + + str q0, [src0, # 0*16] + str q1, [src0, # 1*16] + str q16, [src1, # 0*16] + str q17, [src1, # 1*16] + + add table0, table0, #128 + add table1, table1, #128 + + add src0, src0, #64 + add src1, src1, #64 + .unreq Q .unreq RphRdp .unreq src0 - .unreq des0 .unreq src1 - .unreq des1 .unreq table0 .unreq table1 .unreq counter pop_all - br lr - - + ret diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_poly.S b/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_poly.S index 245866626..e1225c598 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_poly.S +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,10 +32,10 @@ #include "params.h" .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32 -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32 -PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32: -_PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32 +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32 +PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32: +_PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32: mov x7, #16 _10_to_32_loop: @@ -45,7 +48,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32: str w4, [x0], #4 ubfx w5, w2, #20, #10 str w5, [x0], #4 - lsr w6, w2, #30 + lsr w6, w2, #30 ldr w2, [x1], #4 @@ -99,13 +102,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32: sub x7, x7, #1 cbnz x7, _10_to_32_loop - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce: ldr w4, [x1] @@ -117,7 +120,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: ld1 { v1.4S}, [x1], #16 ld1 { v2.4S}, [x1], #16 ld1 { v3.4S}, [x1], #16 - + ld1 { v4.4S}, [x1], #16 srshr v16.4S, v0.4S, #23 ld1 { v5.4S}, [x1], #16 @@ -126,7 +129,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -135,7 +138,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -192,13 +195,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq: ldr w4, [x1] @@ -285,13 +288,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze: ldr w4, [x1] @@ -312,7 +315,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -321,7 +324,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S sshr v16.4S, v0.4S, #31 mls v5.4S, v21.4S, v24.4S @@ -330,7 +333,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: sshr v18.4S, v2.4S, #31 mls v7.4S, v23.4S, v24.4S sshr v19.4S, v3.4S, #31 - + sshr v20.4S, v4.4S, #31 mls v0.4S, v16.4S, v24.4S sshr v21.4S, v5.4S, #31 @@ -339,7 +342,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S sshr v23.4S, v7.4S, #31 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -414,13 +417,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round: mov w4, #1 @@ -560,13 +563,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round: st1 {v30.4S}, [x1], #16 st1 {v31.4S}, [x1], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_add: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -609,13 +612,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_add: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_sub: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -632,7 +635,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub: mov x16, #15 _poly_sub_loop: - + st1 {v16.4S}, [x0], #16 ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -658,13 +661,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_sub: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_shiftl: add x1, x0, #0 @@ -725,13 +728,13 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_shiftl: st1 {v22.4S}, [x0], #16 st1 {v23.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery -PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: -_PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery +PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery: +_PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery: push_all @@ -769,14 +772,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -819,14 +822,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -843,14 +846,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery: pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery -.global _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery -PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: -_PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: +.global PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery +.global _PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery +PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery: +_PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery: push_all @@ -910,90 +913,90 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1027,14 +1030,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v27.4S, v23.4S, v31.4S ld1 { v7.4S}, [x2], #16 - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1064,90 +1067,90 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1173,14 +1176,14 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1194,7 +1197,7 @@ _PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery: pop_all - br lr + ret diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/api.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/api.h index c8dd59a76..77b1e37d4 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/api.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/api.h @@ -12,8 +12,8 @@ #define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES 1312 #define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES 2560 -#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES 2420 -#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_ALGNAME "Dilithium2" +#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES 2420 +#define PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_ALGNAME "Dilithium2" int PQCLEAN_DILITHIUM2_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/feat.S b/Modules/PQClean/crypto_sign/dilithium2/aarch64/feat.S deleted file mode 100644 index 63be5df6c..000000000 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global PQCLEAN_DILITHIUM2_AARCH64_f1600x2 -.global _PQCLEAN_DILITHIUM2_AARCH64_f1600x2 -PQCLEAN_DILITHIUM2_AARCH64_f1600x2: -_PQCLEAN_DILITHIUM2_AARCH64_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/fips202x2.c b/Modules/PQClean/crypto_sign/dilithium2/aarch64/fips202x2.c deleted file mode 100644 index 2567f381c..000000000 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/fips202x2.c +++ /dev/null @@ -1,684 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We choose - * CC0 1.0 Universal or the following MIT License for this file. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_DILITHIUM2_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/fips202x2.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/fips202x2.h deleted file mode 100644 index 28babbc39..000000000 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/fips202x2.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include "params.h" -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -#define shake128x2_absorb DILITHIUM_NAMESPACE(shake128x2_absorb) -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake128x2_squeezeblocks DILITHIUM_NAMESPACE(shake128x2_squeezeblocks) -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake256x2_absorb DILITHIUM_NAMESPACE(shake256x2_absorb) -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2_squeezeblocks DILITHIUM_NAMESPACE(shake256x2_squeezeblocks) -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake128x2 DILITHIUM_NAMESPACE(shake128x2) -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2 DILITHIUM_NAMESPACE(shake256x2) -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); -#endif diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/macros.inc b/Modules/PQClean/crypto_sign/dilithium2/aarch64/macros.inc index ef3af4c54..5504405c1 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/macros.inc +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,24 +30,254 @@ #include "macros_common.inc" -.macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - trn1 \t0\qS, \a0\qS, \a1\qS - trn2 \t1\qS, \a0\qS, \a1\qS - trn1 \t2\qS, \a2\qS, \a3\qS - trn2 \t3\qS, \a2\qS, \a3\qS + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S - trn1 \a0\dD, \t0\dD, \t2\dD - trn2 \a2\dD, \t0\dD, \t2\dD - trn1 \a1\dD, \t1\dD, \t3\dD - trn2 \a3\dD, \t1\dD, \t3\dD + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D .endm -.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - wrap_trn_4x4 \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, .4S, .2D + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + +.macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + +.macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 .endm +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + + +.macro do_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== .macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S @@ -54,12 +287,20 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 .endm -.macro dq_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S .endm @@ -67,16 +308,32 @@ wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + .macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 .endm -.macro dq_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm @@ -89,16 +346,48 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + .macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm -.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 .endm -.macro qq_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S .endm @@ -109,3 +398,5 @@ .macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S .endm + +// === 32-bit end ==== diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/macros_common.inc b/Modules/PQClean/crypto_sign/dilithium2/aarch64/macros_common.inc index bd7e77eb9..07568491d 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/macros_common.inc +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,35 +28,58 @@ * SOFTWARE. */ +#ifndef MACROS_COMMON +#define MACROS_COMMON + // for ABI .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -72,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -82,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -99,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -135,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -149,7 +310,134 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t4\wX, \b4\wX, \z4\nX[\h4] @@ -176,7 +464,186 @@ .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] + mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] + mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] + mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -218,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -228,15 +769,82 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] sub \b1\wX, \a1\wX, \t1\wX mul \t3\wX, \b3\wX, \h3\wX + ldr \c2, [\srcc_ptr, \memc2] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b3\wX, \b3\wX, \l3\wX @@ -245,7 +853,7 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX @@ -262,57 +870,98 @@ .endm +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + // vector-scalar Barrett reduction .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -391,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX @@ -448,3 +1191,10 @@ add \s3\wX, \a3\wX, \b3\wX .endm + + +#endif + + + + diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/ntt.c b/Modules/PQClean/crypto_sign/dilithium2/aarch64/ntt.c index 2d88c5d5e..ec594a77c 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/ntt.c +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/ntt.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,12 +33,26 @@ */ #include "params.h" -#include "reduce.h" +#include "NTT_params.h" +#include "ntt.h" #include #include -#include "NTT_params.h" -#include "ntt.h" +const __attribute__ ((aligned (16)))int32_t constants[16] = { + Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, + invNQ1R2modQ1_prime_half, + invNQ1R2modQ1_doubleprime, + invNQ1_final_R2modQ1_prime_half, + invNQ1_final_R2modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { + 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { + 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 +}; /************************************************* * Name: ntt diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/ntt.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/ntt.h index 86796ca8c..5c85aa48a 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/ntt.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/ntt.h @@ -1,17 +1,19 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_NTT_H +#define PQCLEAN_DILITHIUM2_AARCH64_NTT_H /* * This file was originally licensed * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,45 +34,42 @@ * SOFTWARE. */ -#include "NTT_params.h" #include "params.h" +#include "NTT_params.h" #include -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants); +#define constants DILITHIUM_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define streamlined_GS_itable_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_GS_itable_Q1_jump_extended) + +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); + +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); + +extern +const int32_t constants[16]; -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants); +extern +const int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define NTT(in) { \ - PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM2_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define iNTT(in) { \ - PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM2_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +#define NTT(in) do { \ + PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM2_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_bot(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM2_AARCH64__asm_intt_SIMD_top(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + } while(0) #define ntt DILITHIUM_NAMESPACE(ntt) void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); -static const int constants[16] = { - Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, - invNQ1R2modQ1_prime_half, - invNQ1R2modQ1_doubleprime, - invNQ1_final_R2modQ1_prime_half, - invNQ1_final_R2modQ1_doubleprime -}; - -static const int streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 0, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 3073009, 327391679, 1277625, -675340520, -2635473, 987079667, 3852015, 449207, 1753, -495951789, -1935420, -681503850, -2659525, -373072124, -1455890, 681730119, 2660408, -456183549, -1780227, -15156688, -59148, 710479343, 2772600, 0, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, 4183372, -825844983, -3222807, -799869667, -3121440, -70227934, -274060, 302950022, 1182243, 22347069, 87208, 163212680, 636927, -1016110510, -3965306, -1013916752, -3956745, -588452222, -2296397, -841760171, -3284915, -952468207, -3716946, 0, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 2508980, 519705671, 2028118, 496502727, 1937570, -977780347, -3815725, -7126831, -27812, 210776307, 822541, 258649997, 1009365, -628875181, -2454145, -507246529, -1979497, 409185979, 1596822, -1013967746, -3956944, -963363710, -3759465, 0, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, 2811291, -764594519, -2983781, -284313712, -1109516, 1065510939, 4158088, -431820817, -1685153, -873958779, -3410568, 686309310, 2678278, -965793731, -3768948, -909946047, -3551006, 162963861, 635956, -64176841, -250446, -629190881, -2455377, 0, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 1528066, 123678909, 482649, 294395108, 1148858, -759080783, -2962264, -1062481036, -4146264, -454226054, -1772588, 561940831, 2192938, -442566669, -1727088, 611800717, 2387513, -925511710, -3611750, -68791907, -268456, -814992530, -3180456, 0, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, -565603, 43482586, 169688, 631001801, 2462444, -854436357, -3334383, 960233614, 3747250, 588375860, 2296099, 317727459, 1239911, -983611064, -3838479, 818892658, 3195676, 677264190, 2642980, 321386456, 1254190, -3181859, -12417, 0, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -4166425, -893898890, -3488383, 509377762, 1987814, -819295484, -3197248, 768294260, 2998219, 36345249, 141835, -22883400, -89301, 643961400, 2513018, -347191365, -1354892, 157142369, 613238, -335754661, -1310261, -568482643, -2218467, 0, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 1736313, 60323094, 235407, -832852657, -3250154, 834980303, 3258457, -117552223, -458740, -492511373, -1921994, 1035301089, 4040196, -889718424, -3472069, 522531086, 2039144, -481719139, -1879878, -209807681, -818761, -558360247, -2178965, 0, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, -2579253, 458160776, 1787943, -612717067, -2391089, -577774276, -2254727, -415984810, -1623354, 539479988, 2105286, -608441020, -2374402, -521163479, -2033807, 150224382, 586241, -302276083, -1179613, 135295244, 527981, -702999655, -2743411, 0, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, 3482206, -1071872863, -4182915, -333129378, -1300016, -605279149, -2362063, -378477722, -1476985, 510974714, 1994046, 638402564, 2491325, -356997292, -1393159, 130156402, 507927, -304395785, -1187885, -185731180, -724804, -470097680, -1834526, 0, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, -1317678, 630730945, 2461387, 777970524, 3035980, 159173408, 621164, -777397036, -3033742, -86720197, -338420, 678549029, 2647994, 771248568, 3009748, -669544140, -2612853, 1063046068, 4148469, 192079267, 749577, -1030830548, -4022750, 0, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, 3901472, -314332144, -1226661, 749740976, 2925816, 864652284, 3374250, 1020029345, 3980599, 658309618, 2569011, -413979908, -1615530, 441577800, 1723229, 426738094, 1665318, 519685171, 2028038, 298172236, 1163598, -863376927, -3369273, 0, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, 1356448, -711287812, -2775755, 687588511, 2683270, -712065019, -2778788, 1023635298, 3994671, -3043996, -11879, -351195274, -1370517, 773976352, 3020393, 861908357, 3363542, 55063046, 214880, 139752717, 545376, -197425671, -770441, 0, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, -3467665, 592665232, 2312838, -167401858, -653275, -117660617, -459163, 795799901, 3105558, -282732136, -1103344, 130212265, 508145, -141890356, -553718, 220412084, 860144, 879049958, 3430436, 35937555, 140244, -388001774, -1514152, 0, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, 348812, -84011120, -327848, 259126110, 1011223, -603268097, -2354215, -559928242, -2185084, 800464680, 3123762, 604333585, 2358373, -561979013, -2193087, -772445769, -3014420, -439933955, -1716814, 749801963, 2926054, -100631253, -392707, 0, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -3818627, -492577742, -1922253, -573161516, -2236726, 447030292, 1744507, -77645096, -303005, 904878186, 3531229, -1018462631, -3974485, -967019376, -3773731, 486888731, 1900052, -200355636, -781875, 270210213, 1054478, -187430119, -731434, 0, 0 -}; - -static const int streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 0, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, -1744507, 573161516, 2236726, 492577742, 1922253, 978523985, 3818627, 187430119, 731434, -270210213, -1054478, 200355636, 781875, -486888731, -1900052, 967019376, 3773731, 1018462631, 3974485, -904878186, -3531229, 77645096, 303005, 0, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, 2354215, -259126110, -1011223, 84011120, 327848, -89383150, -348812, 100631253, 392707, -749801963, -2926054, 439933955, 1716814, 772445769, 3014420, 561979013, 2193087, -604333585, -2358373, -800464680, -3123762, 559928242, 2185084, 0, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 459163, 167401858, 653275, -592665232, -2312838, 888589898, 3467665, 388001774, 1514152, -35937555, -140244, -879049958, -3430436, -220412084, -860144, 141890356, 553718, -130212265, -508145, 282732136, 1103344, -795799901, -3105558, 0, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, 2778788, -687588511, -2683270, 711287812, 2775755, -347590090, -1356448, 197425671, 770441, -139752717, -545376, -55063046, -214880, -861908357, -3363542, -773976352, -3020393, 351195274, 1370517, 3043996, 11879, -1023635298, -3994671, 0, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -3374250, -749740976, -2925816, 314332144, 1226661, -999753034, -3901472, 863376927, 3369273, -298172236, -1163598, -519685171, -2028038, -426738094, -1665318, -441577800, -1723229, 413979908, 1615530, -658309618, -2569011, -1020029345, -3980599, 0, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -621164, -777970524, -3035980, -630730945, -2461387, 337655269, 1317678, 1030830548, 4022750, -192079267, -749577, -1063046068, -4148469, 669544140, 2612853, -771248568, -3009748, -678549029, -2647994, 86720197, 338420, 777397036, 3033742, 0, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 2362063, 333129378, 1300016, 1071872863, 4182915, -892316032, -3482206, 470097680, 1834526, 185731180, 724804, 304395785, 1187885, -130156402, -507927, 356997292, 1393159, -638402564, -2491325, -510974714, -1994046, 378477722, 1476985, 0, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 2254727, 612717067, 2391089, -458160776, -1787943, 660934133, 2579253, 702999655, 2743411, -135295244, -527981, 302276083, 1179613, -150224382, -586241, 521163479, 2033807, 608441020, 2374402, -539479988, -2105286, 415984810, 1623354, 0, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, -3258457, 832852657, 3250154, -60323094, -235407, -444930577, -1736313, 558360247, 2178965, 209807681, 818761, 481719139, 1879878, -522531086, -2039144, 889718424, 3472069, -1035301089, -4040196, 492511373, 1921994, 117552223, 458740, 0, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, 3197248, -509377762, -1987814, 893898890, 3488383, 1067647297, 4166425, 568482643, 2218467, 335754661, 1310261, -157142369, -613238, 347191365, 1354892, -643961400, -2513018, 22883400, 89301, -36345249, -141835, -768294260, -2998219, 0, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, 3334383, -631001801, -2462444, -43482586, -169688, 144935890, 565603, 3181859, 12417, -321386456, -1254190, -677264190, -2642980, -818892658, -3195676, 983611064, 3838479, -317727459, -1239911, -588375860, -2296099, -960233614, -3747250, 0, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, 2962264, -294395108, -1148858, -123678909, -482649, -391567239, -1528066, 814992530, 3180456, 68791907, 268456, 925511710, 3611750, -611800717, -2387513, 442566669, 1727088, -561940831, -2192938, 454226054, 1772588, 1062481036, 4146264, 0, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, -4158088, 284313712, 1109516, 764594519, 2983781, -720393920, -2811291, 629190881, 2455377, 64176841, 250446, -162963861, -635956, 909946047, 3551006, 965793731, 3768948, -686309310, -2678278, 873958779, 3410568, 431820817, 1685153, 0, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, 3815725, -496502727, -1937570, -519705671, -2028118, -642926661, -2508980, 963363710, 3759465, 1013967746, 3956944, -409185979, -1596822, 507246529, 1979497, 628875181, 2454145, -258649997, -1009365, -210776307, -822541, 7126831, 27812, 0, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 274060, 799869667, 3121440, 825844983, 3222807, -1071989969, -4183372, 952468207, 3716946, 841760171, 3284915, 588452222, 2296397, 1013916752, 3956745, 1016110510, 3965306, -163212680, -636927, -22347069, -87208, -302950022, -1182243, 0, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, -3852015, 675340520, 2635473, -327391679, -1277625, -787459213, -3073009, -710479343, -2772600, 15156688, 59148, 456183549, 1780227, -681730119, -2660408, 373072124, 1455890, 681503850, 2659525, 495951789, 1935420, -449207, -1753, 0, 0 -}; - #endif diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/packing.c b/Modules/PQClean/crypto_sign/dilithium2/aarch64/packing.c index 8fa3b0ccb..1d46d7a6e 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/packing.c +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/packing.c @@ -19,7 +19,7 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1) { unsigned int i; @@ -45,7 +45,7 @@ void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], **************************************************/ void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) { + const uint8_t pk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -71,7 +71,7 @@ void unpack_pk(uint8_t rho[SEEDBYTES], * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 **************************************************/ -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -129,7 +129,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]) { + const uint8_t sk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -172,7 +172,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h **************************************************/ -void pack_sig(uint8_t sig[CRYPTO_BYTES], +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h) { @@ -221,7 +221,7 @@ void pack_sig(uint8_t sig[CRYPTO_BYTES], int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, - const uint8_t sig[CRYPTO_BYTES]) { + const uint8_t sig[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES]) { unsigned int i, j, k; for (i = 0; i < CTILDEBYTES; ++i) { diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/packing.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/packing.h index fb70ce5db..162b4be39 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/packing.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/packing.h @@ -1,5 +1,5 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_PACKING_H +#define PQCLEAN_DILITHIUM2_AARCH64_PACKING_H /* * This file is dual licensed @@ -7,15 +7,16 @@ * or public domain at https://github.com/pq-crystals/dilithium */ +#include "api.h" #include "params.h" #include "polyvec.h" #include #define pack_pk DILITHIUM_NAMESPACE(pack_pk) -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); #define pack_sk DILITHIUM_NAMESPACE(pack_sk) -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -24,10 +25,10 @@ void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], const polyveck *s2); #define pack_sig DILITHIUM_NAMESPACE(pack_sig) -void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); #define unpack_pk DILITHIUM_NAMESPACE(unpack_pk) -void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]); +void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES]); #define unpack_sk DILITHIUM_NAMESPACE(unpack_sk) void unpack_sk(uint8_t rho[SEEDBYTES], @@ -36,9 +37,9 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + const uint8_t sk[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_SECRETKEYBYTES]); #define unpack_sig DILITHIUM_NAMESPACE(unpack_sig) -int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); +int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES]); #endif diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/params.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/params.h index 7601e765e..287ca9ddf 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/params.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_PARAMS_H +#define PQCLEAN_DILITHIUM2_AARCH64_PARAMS_H /* * This file is dual licensed @@ -8,11 +8,11 @@ */ #define DILITHIUM_MODE 2 -//#define DILITHIUM_MODE 3 -//#define DILITHIUM_MODE 5 +// #define DILITHIUM_MODE 3 +// #define DILITHIUM_MODE 5 -#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM2_AARCH64_crypto_sign #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM2_AARCH64_##s +#define CRYPTO_NAMESPACETOP crypto_sign #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s) @@ -40,18 +40,20 @@ #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) +// GAMMA1 == (1 << 17) #define POLYZ_PACKEDBYTES 576 +// GAMMA2 == (DILITHIUM_Q-1)/88 #define POLYW1_PACKEDBYTES 192 #define POLYETA_PACKEDBYTES 96 -#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) -#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ - + TRBYTES \ - + L*POLYETA_PACKEDBYTES \ - + K*POLYETA_PACKEDBYTES \ - + K*POLYT0_PACKEDBYTES) -#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) +#define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ + + TRBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define DILITHIUM_CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/poly.c b/Modules/PQClean/crypto_sign/dilithium2/aarch64/poly.c index d2c371ba9..103f294e6 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/poly.c +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/poly.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -35,16 +37,9 @@ #include "reduce.h" #include "rounding.h" #include "symmetric.h" -#include - -#include "fips202x2.h" - -#include "NTT_params.h" +#include "keccak2x/fips202x2.h" #include "ntt.h" - -static const int32_t montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; +#include #define DBENCH_START() #define DBENCH_STOP(t) @@ -57,11 +52,11 @@ static const int32_t montgomery_const[4] = { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce(int32_t *, const int32_t *); void poly_reduce(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_reduce(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -74,11 +69,11 @@ void poly_reduce(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq(int32_t *, const int32_t *); void poly_caddq(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_caddq(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -91,11 +86,11 @@ void poly_caddq(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze(int32_t *, const int32_t *); void poly_freeze(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_freeze(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -205,11 +200,11 @@ void poly_invntt_tomont(poly *a) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, constants); DBENCH_STOP(*tmul); } @@ -226,11 +221,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { * - poly *a0: pointer to output polynomial with coefficients c0 * - const poly *a: pointer to input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *); void poly_power2round(poly *a1, poly *a0, const poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); + PQCLEAN_DILITHIUM2_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); DBENCH_STOP(*tround); } @@ -730,11 +725,11 @@ void polyt1_pack(uint8_t *r, const poly *a) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32(int32_t *, const uint8_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32(int32_t *, const uint8_t *); void polyt1_unpack(poly *r, const uint8_t *a) { DBENCH_START(); - PQCLEAN_DILITHIUM2_AARCH64_asm_10_to_32(r->coeffs, a); + PQCLEAN_DILITHIUM2_AARCH64__asm_10_to_32(r->coeffs, a); DBENCH_STOP(*tpack); } diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/poly.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/poly.h index c253ecf69..a6a8936d5 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/poly.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_POLY_H +#define PQCLEAN_DILITHIUM2_AARCH64_POLY_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/polyvec.c b/Modules/PQClean/crypto_sign/dilithium2/aarch64/polyvec.c index 2018807bf..019c06380 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/polyvec.c +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/polyvec.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,13 +35,9 @@ #include "params.h" #include "poly.h" #include "polyvec.h" -#include - +#include "ntt.h" #include "reduce.h" - -static const int32_t l_montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; +#include /************************************************* * Name: expand_mat @@ -177,11 +175,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -extern void PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { - PQCLEAN_DILITHIUM2_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const); + PQCLEAN_DILITHIUM2_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, constants); } /************************************************* diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/polyvec.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/polyvec.h index dc3377c93..3e7458666 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/polyvec.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_POLYVEC_H +#define PQCLEAN_DILITHIUM2_AARCH64_POLYVEC_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/reduce.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/reduce.h index 9042e6cb0..721feb67b 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/reduce.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_REDUCE_H +#define PQCLEAN_DILITHIUM2_AARCH64_REDUCE_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/rounding.c b/Modules/PQClean/crypto_sign/dilithium2/aarch64/rounding.c index f5efb266c..1ee5075a5 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/rounding.c +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/rounding.c @@ -95,7 +95,8 @@ int32_t use_hint(int32_t a, unsigned int hint) { if (a0 > 0) { return (a1 == 43) ? 0 : a1 + 1; + } else { + return (a1 == 0) ? 43 : a1 - 1; } - return (a1 == 0) ? 43 : a1 - 1; } diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/rounding.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/rounding.h index 36167d2af..f581543d0 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/rounding.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_ROUNDING_H +#define PQCLEAN_DILITHIUM2_AARCH64_ROUNDING_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/sign.c b/Modules/PQClean/crypto_sign/dilithium2/aarch64/sign.c index 3565b3704..ca92eb185 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/sign.c +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/sign.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -90,7 +91,7 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { pack_pk(pk, rho, &t1); /* Compute H(rho, t1) and write secret key */ - shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(tr, TRBYTES, pk, PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES); pack_sk(sk, rho, tr, key, &t0, &s1, &s2); return 0; @@ -210,7 +211,7 @@ int crypto_sign_signature(uint8_t *sig, /* Write signature */ pack_sig(sig, sig, &z, &h); - *siglen = CRYPTO_BYTES; + *siglen = PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES; return 0; } @@ -238,9 +239,9 @@ int crypto_sign(uint8_t *sm, size_t i; for (i = 0; i < mlen; ++i) { - sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + sm[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; } - crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); + crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES, mlen, sk); *smlen += mlen; return 0; } @@ -274,7 +275,7 @@ int crypto_sign_verify(const uint8_t *sig, polyveck t1, w1, h; shake256incctx state; - if (siglen != CRYPTO_BYTES) { + if (siglen != PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES) { return -1; } @@ -287,7 +288,7 @@ int crypto_sign_verify(const uint8_t *sig, } /* Compute CRH(H(rho, t1), msg) */ - shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(mu, CRHBYTES, pk, PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_PUBLICKEYBYTES); shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -353,17 +354,17 @@ int crypto_sign_open(uint8_t *m, const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; - if (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES; + if (crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] = sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM2_AARCH64_CRYPTO_BYTES + i]; } return 0; } diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/sign.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/sign.h index bc8c42658..8b8a52832 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/sign.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/sign.h @@ -1,5 +1,5 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_SIGN_H +#define PQCLEAN_DILITHIUM2_AARCH64_SIGN_H /* * This file is dual licensed @@ -24,7 +24,7 @@ int crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); -#define crypto_sign DILITHIUM_NAMESPACETOP +#define crypto_sign DILITHIUM_NAMESPACE(crypto_sign) int crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk); diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/symmetric-shake.c b/Modules/PQClean/crypto_sign/dilithium2/aarch64/symmetric-shake.c index a53074aac..53aab1c94 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/symmetric-shake.c +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/symmetric-shake.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_sign/dilithium2/aarch64/symmetric.h b/Modules/PQClean/crypto_sign/dilithium2/aarch64/symmetric.h index 40b928ec6..7d3aa1a79 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/aarch64/symmetric.h +++ b/Modules/PQClean/crypto_sign/dilithium2/aarch64/symmetric.h @@ -1,13 +1,14 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM2_AARCH64_SYMMETRIC_H +#define PQCLEAN_DILITHIUM2_AARCH64_SYMMETRIC_H /* * This file was originally licensed * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -33,7 +34,7 @@ */ #include "fips202.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "params.h" #include diff --git a/Modules/PQClean/crypto_sign/dilithium2/clean/sign.c b/Modules/PQClean/crypto_sign/dilithium2/clean/sign.c index 93a137dcc..8d04fefee 100644 --- a/Modules/PQClean/crypto_sign/dilithium2/clean/sign.c +++ b/Modules/PQClean/crypto_sign/dilithium2/clean/sign.c @@ -337,7 +337,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, badsig: /* Signature verification failed */ - *mlen = -1; + *mlen = (size_t) -1; for (i = 0; i < smlen; ++i) { m[i] = 0; } diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/LICENSE b/Modules/PQClean/crypto_sign/dilithium3/aarch64/LICENSE index 0e259d42c..093b0a7db 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/LICENSE +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/Makefile b/Modules/PQClean/crypto_sign/dilithium3/aarch64/Makefile index 3c2ad454b..f1b97b4f3 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/Makefile +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/Makefile @@ -1,12 +1,16 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium3_aarch64.a -HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h -OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o feat.o +HEADERS=api.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) + all: $(LIB) %.o: %.c $(HEADERS) @@ -15,8 +19,11 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) $(HEADERS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) clean: $(RM) $(OBJECTS) diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/NTT_params.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/NTT_params.h index 582c16ed5..053c3677f 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/NTT_params.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/NTT_params.h @@ -1,8 +1,10 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_NTT_PARAMS_H +#define PQCLEAN_DILITHIUM3_AARCH64_NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_NTT.S b/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_NTT.S index 0c7732d27..fad817300 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_NTT.S +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,261 +30,413 @@ #include "macros.inc" -.align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top -PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top: -_PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top: - - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 - - ldr Q, [x2] - - mov table, x1 - - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 - - add src11, src0, #704 - add src12, src0, #768 +#include "params.h" - add src13, src0, #832 - add src14, src0, #896 +.align 2 +.global PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top +PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top: +_PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top: - add src15, src0, #960 + push_simd + Q .req w8 + src .req x0 + counter .req x11 - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 + ldr Q, [x2] mov v20.S[0], Q - ld1 { v1.4S}, [ src1] - ld1 { v3.4S}, [ src3] - ld1 { v5.4S}, [ src5] - ld1 { v7.4S}, [ src7] - ld1 { v9.4S}, [ src9] - ld1 {v11.4S}, [src11] - ld1 {v13.4S}, [src13] - ld1 {v15.4S}, [src15] - - ld1 { v0.4S}, [ src0] - ld1 { v2.4S}, [ src2] - ld1 { v4.4S}, [ src4] - ld1 { v6.4S}, [ src6] - ld1 { v8.4S}, [ src8] - ld1 {v10.4S}, [src10] - ld1 {v12.4S}, [src12] - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + ldr q9, [src, #9*64] + ldr q11, [src, #11*64] + ldr q13, [src, #13*64] + ldr q15, [src, #15*64] + + qq_butterfly_topl \ + v9, v11, v13, v15, v16, v17, v18, v19, v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64 + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) mov counter, #3 _ntt_top_loop: - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mixssl \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + v9, v11, v13, v15, v16, v17, v18, v19, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64, \ + src, \ + q1, q3, q5, q7, \ + #(16+1*64), #(16+3*64), #(16+5*64), #(16+7*64) + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #(16+8*64), #(16+10*64), #(16+12*64), #(16+14*64), \ + src, \ + q0, q2, q4, q6, \ + #(16+0*64), #(16+2*64), #(16+4*64), #(16+6*64) + + add src, src, #16 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) sub counter, counter, #1 cbnz counter, _ntt_top_loop - st1 { v1.4S}, [ src1], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v7.4S}, [ src7], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v11.4S}, [src11], #16 - st1 {v13.4S}, [src13], #16 - st1 {v15.4S}, [src15], #16 - - st1 { v0.4S}, [ src0], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v8.4S}, [ src8], #16 - st1 {v10.4S}, [src10], #16 - st1 {v12.4S}, [src12], #16 - st1 {v14.4S}, [src14], #16 + qq_butterfly_botss \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter - pop_all + pop_simd - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot -PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot: -_PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot: - - push_all - Q .req w20 - src0 .req x0 - des0 .req x1 - src1 .req x2 - des1 .req x3 - table0 .req x28 - table1 .req x27 - counter .req x19 +.global PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot +PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot: +_PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot: + + push_simd + Q .req w8 + src .req x0 + table0 .req x9 + table1 .req x10 + counter .req x11 ldr Q, [x2] add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + ldr q0, [src, #0*16] + ldr q1, [src, #1*16] + ldr q2, [src, #2*16] + ldr q3, [src, #3*16] + + ldr q4, [table0, #0*16] + ldr q5, [table0, #1*16] + ldr q20, [table1, #0*16] + ldr q21, [table1, #1*16] + + dq_butterfly_topl4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 + + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + - add des0, src0, #0 - add des1, src0, #512 + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - mov counter, #8 + str q0, [src, #0*16] + str q2, [src, #2*16] + + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 + + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + mov counter, #3 _ntt_bot_loop: - ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld1 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + trn_4x4_l4 v12, v13, v14, v15, v8, v9, v10, v11, src, q0, q1, q2, q3, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + str q12, [src, #0*16] + str q13, [src, #1*16] - mov v4.S[0], Q + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - dq_butterfly_top v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 - dq_butterfly_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + str q14, [src, #2*16] + str q15, [src, #3*16] + + + add src, src, #64 + + trn_4x4_l4 v28, v29, v30, v31, v24, v25, v26, v27, src, q16, q17, q18, q19, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q28, q29, q30, q31, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + add table0, table0, #128 + add table1, table1, #128 + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 - dq_butterfly_vec_top v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7 - dq_butterfly_vec_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 - dq_butterfly_vec_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) + + str q0, [src, #0*16] + str q2, [src, #2*16] + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 - st4 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st4 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - .unreq Q - .unreq src0 - .unreq des0 - .unreq src1 - .unreq des1 - .unreq table0 - .unreq table1 - .unreq counter - pop_all + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - br lr + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + trn_4x4_s4 v28, v29, v30, v31, v16, v17, v18, v19, src, q12, q13, q14, q15, #0*16, #1*16, #2*16, #3*16 + str q28, [src, #(512+0*16)] + str q29, [src, #(512+1*16)] + str q30, [src, #(512+2*16)] + str q31, [src, #(512+3*16)] + add src, src, #64 + .unreq Q + .unreq src + .unreq table0 + .unreq table1 + .unreq counter + pop_simd + ret diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_iNTT.S b/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_iNTT.S index 7c05e2ec8..49dbca853 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_iNTT.S +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top -PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: -_PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top +PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top: +_PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top: push_all Q .req w20 @@ -41,23 +44,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: invNR2dp .req w25 invNWR2ph .req w26 invNWR2dp .req w27 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 counter .req x19 ldr Q, [x2, #0] @@ -69,77 +56,63 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: ldr invNWR2ph, [x2, #24] ldr invNWR2dp, [x2, #28] - mov table, x1 + ldr q20, [x1, #0*16] + ldr q21, [x1, #1*16] + ldr q22, [x1, #2*16] + ldr q23, [x1, #3*16] + ldr q24, [x1, #4*16] + ldr q25, [x1, #5*16] + ldr q26, [x1, #6*16] + ldr q27, [x1, #7*16] - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 + mov v20.S[0], Q - add src11, src0, #704 - add src12, src0, #768 + ldr q0, [src, # 0*64] + ldr q1, [src, # 1*64] - add src13, src0, #832 - add src14, src0, #896 + ldr q2, [src, # 2*64] + ldr q3, [src, # 3*64] - add src15, src0, #960 + ldr q4, [src, # 4*64] + ldr q5, [src, # 5*64] - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ldr q6, [src, # 6*64] + ldr q7, [src, # 7*64] - mov v20.S[0], Q + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 - ld1 { v0.4S}, [ src0] - ld1 { v1.4S}, [ src1] - ld1 { v2.4S}, [ src2] - ld1 { v3.4S}, [ src3] - ld1 { v4.4S}, [ src4] - ld1 { v5.4S}, [ src5] - ld1 { v6.4S}, [ src6] - ld1 { v7.4S}, [ src7] - - ld1 { v8.4S}, [ src8] - ld1 { v9.4S}, [ src9] - ld1 {v10.4S}, [src10] - ld1 {v11.4S}, [src11] - ld1 {v12.4S}, [src12] - ld1 {v13.4S}, [src13] - ld1 {v14.4S}, [src14] - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + mov counter, #3 + _intt_top_loop: + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -153,77 +126,99 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: sub v17.4S, v17.4S, v19.4S mla v0.4S, v16.4S, v29.4S - mla v1.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v2.4S + mla v1.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v3.4S + + str q0, [src, #0*64] cmge v16.4S, v2.4S, v30.4S + ldr q0, [src, #(16 + 0*64)] + str q1, [src, #1*64] cmge v17.4S, v3.4S, v30.4S + ldr q1, [src, #(16 + 1*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v2.4S, v16.4S, v29.4S - mla v3.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v4.4S + mla v3.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v5.4S + + str q2, [src, #2*64] cmge v16.4S, v4.4S, v30.4S + ldr q2, [src, #(16 + 2*64)] + str q3, [src, #3*64] cmge v17.4S, v5.4S, v30.4S + ldr q3, [src, #(16 + 3*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v4.4S, v16.4S, v29.4S - mla v5.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v6.4S + mla v5.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v7.4S + + str q4, [src, #4*64] cmge v16.4S, v6.4S, v30.4S + ldr q4, [src, #(16 + 4*64)] + str q5, [src, #5*64] cmge v17.4S, v7.4S, v30.4S + ldr q5, [src, #(16 + 5*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v6.4S, v16.4S, v29.4S - mla v7.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v8.4S + mla v7.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v9.4S + + str q6, [src, #6*64] cmge v16.4S, v8.4S, v30.4S + ldr q6, [src, #(16 + 6*64)] + str q7, [src, #7*64] cmge v17.4S, v9.4S, v30.4S + ldr q7, [src, #(16 + 7*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v8.4S, v16.4S, v29.4S - mla v9.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v10.4S + mla v9.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v11.4S + + str q8, [src, #8*64] cmge v16.4S, v10.4S, v30.4S + str q9, [src, #9*64] cmge v17.4S, v11.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v10.4S, v16.4S, v29.4S - mla v11.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v12.4S + mla v11.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v13.4S + + str q10, [src, #10*64] cmge v16.4S, v12.4S, v30.4S + str q11, [src, #11*64] cmge v17.4S, v13.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v12.4S, v16.4S, v29.4S - mla v13.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v14.4S + mla v13.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v15.4S + + str q12, [src, #12*64] cmge v16.4S, v14.4S, v30.4S + str q13, [src, #13*64] cmge v17.4S, v15.4S, v30.4S sub v16.4S, v16.4S, v18.4S @@ -232,66 +227,45 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - mov counter, #3 - _intt_top_loop: - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 + + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 + + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -307,6 +281,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v0.4S, v16.4S, v29.4S mla v1.4S, v17.4S, v29.4S + str q0, [src, #0*64] + str q1, [src, #1*64] + cmge v18.4S, v31.4S, v2.4S cmge v19.4S, v31.4S, v3.4S cmge v16.4S, v2.4S, v30.4S @@ -318,6 +295,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v2.4S, v16.4S, v29.4S mla v3.4S, v17.4S, v29.4S + str q2, [src, #2*64] + str q3, [src, #3*64] + cmge v18.4S, v31.4S, v4.4S cmge v19.4S, v31.4S, v5.4S cmge v16.4S, v4.4S, v30.4S @@ -329,6 +309,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v4.4S, v16.4S, v29.4S mla v5.4S, v17.4S, v29.4S + str q4, [src, #4*64] + str q5, [src, #5*64] + cmge v18.4S, v31.4S, v6.4S cmge v19.4S, v31.4S, v7.4S cmge v16.4S, v6.4S, v30.4S @@ -340,6 +323,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v6.4S, v16.4S, v29.4S mla v7.4S, v17.4S, v29.4S + str q6, [src, #6*64] + str q7, [src, #7*64] + cmge v18.4S, v31.4S, v8.4S cmge v19.4S, v31.4S, v9.4S cmge v16.4S, v8.4S, v30.4S @@ -351,6 +337,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v8.4S, v16.4S, v29.4S mla v9.4S, v17.4S, v29.4S + str q8, [src, #8*64] + str q9, [src, #9*64] + cmge v18.4S, v31.4S, v10.4S cmge v19.4S, v31.4S, v11.4S cmge v16.4S, v10.4S, v30.4S @@ -362,6 +351,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v10.4S, v16.4S, v29.4S mla v11.4S, v17.4S, v29.4S + str q10, [src, #10*64] + str q11, [src, #11*64] + cmge v18.4S, v31.4S, v12.4S cmge v19.4S, v31.4S, v13.4S cmge v16.4S, v12.4S, v30.4S @@ -373,6 +365,9 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v12.4S, v16.4S, v29.4S mla v13.4S, v17.4S, v29.4S + str q12, [src, #12*64] + str q13, [src, #13*64] + cmge v18.4S, v31.4S, v14.4S cmge v19.4S, v31.4S, v15.4S cmge v16.4S, v14.4S, v30.4S @@ -384,26 +379,11 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - sub counter, counter, #1 - cbnz counter, _intt_top_loop + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 - st1 { v0.4S}, [ src0], #16 - st1 { v1.4S}, [ src1], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v7.4S}, [ src7], #16 - - st1 { v8.4S}, [ src8], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v10.4S}, [src10], #16 - st1 {v11.4S}, [src11], #16 - st1 {v12.4S}, [src12], #16 - st1 {v13.4S}, [src13], #16 - st1 {v14.4S}, [src14], #16 - st1 {v15.4S}, [src15], #16 .unreq Q .unreq Qhalf @@ -412,41 +392,23 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top: .unreq invNR2dp .unreq invNWR2ph .unreq invNWR2dp - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot -PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot: -_PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot +PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot: +_PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot: push_all Q .req w20 RphRdp .req x21 src0 .req x0 - des0 .req x1 src1 .req x2 - des1 .req x3 table0 .req x28 table1 .req x27 counter .req x19 @@ -457,72 +419,175 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot: add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + add src1, src0, #512 - add des0, src0, #0 - add des1, src0, #512 + ldr q8, [table0, #4*16] + ldr q9, [table0, #5*16] + ldr q10, [table0, #6*16] + ldr q11, [table0, #7*16] - mov counter, #8 - _intt_bot_loop: + ldr q24, [table1, #4*16] + ldr q25, [table1, #5*16] + ldr q26, [table1, #6*16] + ldr q27, [table1, #7*16] + + ldr q0, [src0, # 0*16] + ldr q1, [src0, # 1*16] + + ldr q16, [src1, # 0*16] + ldr q17, [src1, # 1*16] - ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld4 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + ldr q2, [src0, # 2*16] + ldr q3, [src0, # 3*16] - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + ldr q18, [src1, # 2*16] + ldr q19, [src1, # 3*16] + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 mov v4.S[0], Q mov v20.D[0], RphRdp dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 - dq_butterfly_vec_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 - dq_butterfly_vec_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 - dq_butterfly_vec_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_top v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + mov counter, #7 + _intt_bot_loop: + + dq_butterfly_vec_top_ltrn_4x4 \ + v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + table0, \ + q8, q9, q10, q11, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16), \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q24, q25, q26, q27, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16) dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 - dq_butterfly_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + str q2, [src0, # 2*16] srshr v14.4S, v0.4S, #23 + ldr q2, [src0, #(64+ 2*16)] + str q3, [src0, # 3*16] srshr v15.4S, v1.4S, #23 + ldr q3, [src0, #(64+ 3*16)] + str q18, [src1, # 2*16] srshr v30.4S, v16.4S, #23 + ldr q18, [src1, #(64+ 2*16)] + str q19, [src1, # 3*16] srshr v31.4S, v17.4S, #23 + ldr q19, [src1, #(64+ 3*16)] mls v0.4S, v14.4S, v4.S[0] + str q0, [src0, # 0*16] + ldr q0, [src0, #(64+ 0*16)] mls v1.4S, v15.4S, v4.S[0] + str q1, [src0, # 1*16] + ldr q1, [src0, #(64+ 1*16)] mls v16.4S, v30.4S, v4.S[0] + str q16, [src1, # 0*16] + ldr q16, [src1, #(64+ 0*16)] mls v17.4S, v31.4S, v4.S[0] + str q17, [src1, # 1*16] + ldr q17, [src1, #(64+ 1*16)] + + add table0, table0, #128 + add table1, table1, #128 - st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st1 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 + + mov v4.S[0], Q + mov v20.D[0], RphRdp + + dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 sub counter, counter, #1 cbnz counter, _intt_bot_loop + dq_butterfly_vec_top_trn_4x4 \ + v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + + str q2, [src0, # 2*16] + str q3, [src0, # 3*16] + str q18, [src1, # 2*16] + str q19, [src1, # 3*16] + + srshr v14.4S, v0.4S, #23 + srshr v15.4S, v1.4S, #23 + srshr v30.4S, v16.4S, #23 + srshr v31.4S, v17.4S, #23 + + mls v0.4S, v14.4S, v4.S[0] + mls v1.4S, v15.4S, v4.S[0] + mls v16.4S, v30.4S, v4.S[0] + mls v17.4S, v31.4S, v4.S[0] + + str q0, [src0, # 0*16] + str q1, [src0, # 1*16] + str q16, [src1, # 0*16] + str q17, [src1, # 1*16] + + add table0, table0, #128 + add table1, table1, #128 + + add src0, src0, #64 + add src1, src1, #64 + .unreq Q .unreq RphRdp .unreq src0 - .unreq des0 .unreq src1 - .unreq des1 .unreq table0 .unreq table1 .unreq counter pop_all - br lr - - + ret diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_poly.S b/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_poly.S index 25b739452..a8c9568fb 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_poly.S +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,10 +32,10 @@ #include "params.h" .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32 -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32 -PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32: -_PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32 +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32 +PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32: +_PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32: mov x7, #16 _10_to_32_loop: @@ -45,7 +48,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32: str w4, [x0], #4 ubfx w5, w2, #20, #10 str w5, [x0], #4 - lsr w6, w2, #30 + lsr w6, w2, #30 ldr w2, [x1], #4 @@ -99,13 +102,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32: sub x7, x7, #1 cbnz x7, _10_to_32_loop - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce: ldr w4, [x1] @@ -117,7 +120,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: ld1 { v1.4S}, [x1], #16 ld1 { v2.4S}, [x1], #16 ld1 { v3.4S}, [x1], #16 - + ld1 { v4.4S}, [x1], #16 srshr v16.4S, v0.4S, #23 ld1 { v5.4S}, [x1], #16 @@ -126,7 +129,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -135,7 +138,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -192,13 +195,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq: ldr w4, [x1] @@ -285,13 +288,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze: ldr w4, [x1] @@ -312,7 +315,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -321,7 +324,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S sshr v16.4S, v0.4S, #31 mls v5.4S, v21.4S, v24.4S @@ -330,7 +333,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: sshr v18.4S, v2.4S, #31 mls v7.4S, v23.4S, v24.4S sshr v19.4S, v3.4S, #31 - + sshr v20.4S, v4.4S, #31 mls v0.4S, v16.4S, v24.4S sshr v21.4S, v5.4S, #31 @@ -339,7 +342,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S sshr v23.4S, v7.4S, #31 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -414,13 +417,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round: mov w4, #1 @@ -560,13 +563,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round: st1 {v30.4S}, [x1], #16 st1 {v31.4S}, [x1], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_add: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -609,13 +612,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_sub: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -632,7 +635,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub: mov x16, #15 _poly_sub_loop: - + st1 {v16.4S}, [x0], #16 ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -658,13 +661,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_shiftl: add x1, x0, #0 @@ -725,13 +728,13 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl: st1 {v22.4S}, [x0], #16 st1 {v23.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery -PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: -_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery +PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery: +_PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery: push_all @@ -769,14 +772,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -819,14 +822,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -843,14 +846,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery: pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery -.global _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery -PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: -_PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: +.global PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery +.global _PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery +PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery: +_PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery: push_all @@ -910,90 +913,90 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1027,14 +1030,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v27.4S, v23.4S, v31.4S ld1 { v7.4S}, [x2], #16 - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1064,90 +1067,90 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1173,14 +1176,14 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1194,7 +1197,7 @@ _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery: pop_all - br lr + ret diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/api.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/api.h index 635fe1fb2..46832a8ed 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/api.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/api.h @@ -12,8 +12,8 @@ #define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES 1952 #define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES 4032 -#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES 3309 -#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_ALGNAME "Dilithium3" +#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES 3309 +#define PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_ALGNAME "Dilithium3" int PQCLEAN_DILITHIUM3_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/feat.S b/Modules/PQClean/crypto_sign/dilithium3/aarch64/feat.S deleted file mode 100644 index 358adf614..000000000 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global PQCLEAN_DILITHIUM3_AARCH64_f1600x2 -.global _PQCLEAN_DILITHIUM3_AARCH64_f1600x2 -PQCLEAN_DILITHIUM3_AARCH64_f1600x2: -_PQCLEAN_DILITHIUM3_AARCH64_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/fips202x2.c b/Modules/PQClean/crypto_sign/dilithium3/aarch64/fips202x2.c deleted file mode 100644 index 6e3e5d450..000000000 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/fips202x2.c +++ /dev/null @@ -1,684 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We choose - * CC0 1.0 Universal or the following MIT License for this file. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_DILITHIUM3_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/fips202x2.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/fips202x2.h deleted file mode 100644 index 28babbc39..000000000 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/fips202x2.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include "params.h" -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -#define shake128x2_absorb DILITHIUM_NAMESPACE(shake128x2_absorb) -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake128x2_squeezeblocks DILITHIUM_NAMESPACE(shake128x2_squeezeblocks) -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake256x2_absorb DILITHIUM_NAMESPACE(shake256x2_absorb) -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2_squeezeblocks DILITHIUM_NAMESPACE(shake256x2_squeezeblocks) -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake128x2 DILITHIUM_NAMESPACE(shake128x2) -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2 DILITHIUM_NAMESPACE(shake256x2) -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); -#endif diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/macros.inc b/Modules/PQClean/crypto_sign/dilithium3/aarch64/macros.inc index ef3af4c54..5504405c1 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/macros.inc +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,24 +30,254 @@ #include "macros_common.inc" -.macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - trn1 \t0\qS, \a0\qS, \a1\qS - trn2 \t1\qS, \a0\qS, \a1\qS - trn1 \t2\qS, \a2\qS, \a3\qS - trn2 \t3\qS, \a2\qS, \a3\qS + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S - trn1 \a0\dD, \t0\dD, \t2\dD - trn2 \a2\dD, \t0\dD, \t2\dD - trn1 \a1\dD, \t1\dD, \t3\dD - trn2 \a3\dD, \t1\dD, \t3\dD + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D .endm -.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - wrap_trn_4x4 \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, .4S, .2D + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + +.macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + +.macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 .endm +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + + +.macro do_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== .macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S @@ -54,12 +287,20 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 .endm -.macro dq_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S .endm @@ -67,16 +308,32 @@ wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + .macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 .endm -.macro dq_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm @@ -89,16 +346,48 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + .macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm -.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 .endm -.macro qq_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S .endm @@ -109,3 +398,5 @@ .macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S .endm + +// === 32-bit end ==== diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/macros_common.inc b/Modules/PQClean/crypto_sign/dilithium3/aarch64/macros_common.inc index bd7e77eb9..07568491d 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/macros_common.inc +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,35 +28,58 @@ * SOFTWARE. */ +#ifndef MACROS_COMMON +#define MACROS_COMMON + // for ABI .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -72,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -82,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -99,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -135,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -149,7 +310,134 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t4\wX, \b4\wX, \z4\nX[\h4] @@ -176,7 +464,186 @@ .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] + mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] + mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] + mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -218,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -228,15 +769,82 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] sub \b1\wX, \a1\wX, \t1\wX mul \t3\wX, \b3\wX, \h3\wX + ldr \c2, [\srcc_ptr, \memc2] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b3\wX, \b3\wX, \l3\wX @@ -245,7 +853,7 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX @@ -262,57 +870,98 @@ .endm +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + // vector-scalar Barrett reduction .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -391,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX @@ -448,3 +1191,10 @@ add \s3\wX, \a3\wX, \b3\wX .endm + + +#endif + + + + diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/ntt.c b/Modules/PQClean/crypto_sign/dilithium3/aarch64/ntt.c index 2d88c5d5e..ec594a77c 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/ntt.c +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/ntt.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,12 +33,26 @@ */ #include "params.h" -#include "reduce.h" +#include "NTT_params.h" +#include "ntt.h" #include #include -#include "NTT_params.h" -#include "ntt.h" +const __attribute__ ((aligned (16)))int32_t constants[16] = { + Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, + invNQ1R2modQ1_prime_half, + invNQ1R2modQ1_doubleprime, + invNQ1_final_R2modQ1_prime_half, + invNQ1_final_R2modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { + 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { + 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 +}; /************************************************* * Name: ntt diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/ntt.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/ntt.h index 0fdd00404..ea3389017 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/ntt.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/ntt.h @@ -1,17 +1,19 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_NTT_H +#define PQCLEAN_DILITHIUM3_AARCH64_NTT_H /* * This file was originally licensed * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,45 +34,42 @@ * SOFTWARE. */ -#include "NTT_params.h" #include "params.h" +#include "NTT_params.h" #include -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants); +#define constants DILITHIUM_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define streamlined_GS_itable_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_GS_itable_Q1_jump_extended) + +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); + +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); + +extern +const int32_t constants[16]; -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants); +extern +const int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define NTT(in) { \ - PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM3_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define iNTT(in) { \ - PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM3_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +#define NTT(in) do { \ + PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM3_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_bot(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM3_AARCH64__asm_intt_SIMD_top(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + } while(0) #define ntt DILITHIUM_NAMESPACE(ntt) void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); -static const int constants[16] = { - Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, - invNQ1R2modQ1_prime_half, - invNQ1R2modQ1_doubleprime, - invNQ1_final_R2modQ1_prime_half, - invNQ1_final_R2modQ1_doubleprime -}; - -static const int streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 0, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 3073009, 327391679, 1277625, -675340520, -2635473, 987079667, 3852015, 449207, 1753, -495951789, -1935420, -681503850, -2659525, -373072124, -1455890, 681730119, 2660408, -456183549, -1780227, -15156688, -59148, 710479343, 2772600, 0, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, 4183372, -825844983, -3222807, -799869667, -3121440, -70227934, -274060, 302950022, 1182243, 22347069, 87208, 163212680, 636927, -1016110510, -3965306, -1013916752, -3956745, -588452222, -2296397, -841760171, -3284915, -952468207, -3716946, 0, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 2508980, 519705671, 2028118, 496502727, 1937570, -977780347, -3815725, -7126831, -27812, 210776307, 822541, 258649997, 1009365, -628875181, -2454145, -507246529, -1979497, 409185979, 1596822, -1013967746, -3956944, -963363710, -3759465, 0, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, 2811291, -764594519, -2983781, -284313712, -1109516, 1065510939, 4158088, -431820817, -1685153, -873958779, -3410568, 686309310, 2678278, -965793731, -3768948, -909946047, -3551006, 162963861, 635956, -64176841, -250446, -629190881, -2455377, 0, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 1528066, 123678909, 482649, 294395108, 1148858, -759080783, -2962264, -1062481036, -4146264, -454226054, -1772588, 561940831, 2192938, -442566669, -1727088, 611800717, 2387513, -925511710, -3611750, -68791907, -268456, -814992530, -3180456, 0, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, -565603, 43482586, 169688, 631001801, 2462444, -854436357, -3334383, 960233614, 3747250, 588375860, 2296099, 317727459, 1239911, -983611064, -3838479, 818892658, 3195676, 677264190, 2642980, 321386456, 1254190, -3181859, -12417, 0, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -4166425, -893898890, -3488383, 509377762, 1987814, -819295484, -3197248, 768294260, 2998219, 36345249, 141835, -22883400, -89301, 643961400, 2513018, -347191365, -1354892, 157142369, 613238, -335754661, -1310261, -568482643, -2218467, 0, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 1736313, 60323094, 235407, -832852657, -3250154, 834980303, 3258457, -117552223, -458740, -492511373, -1921994, 1035301089, 4040196, -889718424, -3472069, 522531086, 2039144, -481719139, -1879878, -209807681, -818761, -558360247, -2178965, 0, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, -2579253, 458160776, 1787943, -612717067, -2391089, -577774276, -2254727, -415984810, -1623354, 539479988, 2105286, -608441020, -2374402, -521163479, -2033807, 150224382, 586241, -302276083, -1179613, 135295244, 527981, -702999655, -2743411, 0, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, 3482206, -1071872863, -4182915, -333129378, -1300016, -605279149, -2362063, -378477722, -1476985, 510974714, 1994046, 638402564, 2491325, -356997292, -1393159, 130156402, 507927, -304395785, -1187885, -185731180, -724804, -470097680, -1834526, 0, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, -1317678, 630730945, 2461387, 777970524, 3035980, 159173408, 621164, -777397036, -3033742, -86720197, -338420, 678549029, 2647994, 771248568, 3009748, -669544140, -2612853, 1063046068, 4148469, 192079267, 749577, -1030830548, -4022750, 0, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, 3901472, -314332144, -1226661, 749740976, 2925816, 864652284, 3374250, 1020029345, 3980599, 658309618, 2569011, -413979908, -1615530, 441577800, 1723229, 426738094, 1665318, 519685171, 2028038, 298172236, 1163598, -863376927, -3369273, 0, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, 1356448, -711287812, -2775755, 687588511, 2683270, -712065019, -2778788, 1023635298, 3994671, -3043996, -11879, -351195274, -1370517, 773976352, 3020393, 861908357, 3363542, 55063046, 214880, 139752717, 545376, -197425671, -770441, 0, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, -3467665, 592665232, 2312838, -167401858, -653275, -117660617, -459163, 795799901, 3105558, -282732136, -1103344, 130212265, 508145, -141890356, -553718, 220412084, 860144, 879049958, 3430436, 35937555, 140244, -388001774, -1514152, 0, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, 348812, -84011120, -327848, 259126110, 1011223, -603268097, -2354215, -559928242, -2185084, 800464680, 3123762, 604333585, 2358373, -561979013, -2193087, -772445769, -3014420, -439933955, -1716814, 749801963, 2926054, -100631253, -392707, 0, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -3818627, -492577742, -1922253, -573161516, -2236726, 447030292, 1744507, -77645096, -303005, 904878186, 3531229, -1018462631, -3974485, -967019376, -3773731, 486888731, 1900052, -200355636, -781875, 270210213, 1054478, -187430119, -731434, 0, 0 -}; - -static const int streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 0, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, -1744507, 573161516, 2236726, 492577742, 1922253, 978523985, 3818627, 187430119, 731434, -270210213, -1054478, 200355636, 781875, -486888731, -1900052, 967019376, 3773731, 1018462631, 3974485, -904878186, -3531229, 77645096, 303005, 0, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, 2354215, -259126110, -1011223, 84011120, 327848, -89383150, -348812, 100631253, 392707, -749801963, -2926054, 439933955, 1716814, 772445769, 3014420, 561979013, 2193087, -604333585, -2358373, -800464680, -3123762, 559928242, 2185084, 0, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 459163, 167401858, 653275, -592665232, -2312838, 888589898, 3467665, 388001774, 1514152, -35937555, -140244, -879049958, -3430436, -220412084, -860144, 141890356, 553718, -130212265, -508145, 282732136, 1103344, -795799901, -3105558, 0, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, 2778788, -687588511, -2683270, 711287812, 2775755, -347590090, -1356448, 197425671, 770441, -139752717, -545376, -55063046, -214880, -861908357, -3363542, -773976352, -3020393, 351195274, 1370517, 3043996, 11879, -1023635298, -3994671, 0, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -3374250, -749740976, -2925816, 314332144, 1226661, -999753034, -3901472, 863376927, 3369273, -298172236, -1163598, -519685171, -2028038, -426738094, -1665318, -441577800, -1723229, 413979908, 1615530, -658309618, -2569011, -1020029345, -3980599, 0, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -621164, -777970524, -3035980, -630730945, -2461387, 337655269, 1317678, 1030830548, 4022750, -192079267, -749577, -1063046068, -4148469, 669544140, 2612853, -771248568, -3009748, -678549029, -2647994, 86720197, 338420, 777397036, 3033742, 0, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 2362063, 333129378, 1300016, 1071872863, 4182915, -892316032, -3482206, 470097680, 1834526, 185731180, 724804, 304395785, 1187885, -130156402, -507927, 356997292, 1393159, -638402564, -2491325, -510974714, -1994046, 378477722, 1476985, 0, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 2254727, 612717067, 2391089, -458160776, -1787943, 660934133, 2579253, 702999655, 2743411, -135295244, -527981, 302276083, 1179613, -150224382, -586241, 521163479, 2033807, 608441020, 2374402, -539479988, -2105286, 415984810, 1623354, 0, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, -3258457, 832852657, 3250154, -60323094, -235407, -444930577, -1736313, 558360247, 2178965, 209807681, 818761, 481719139, 1879878, -522531086, -2039144, 889718424, 3472069, -1035301089, -4040196, 492511373, 1921994, 117552223, 458740, 0, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, 3197248, -509377762, -1987814, 893898890, 3488383, 1067647297, 4166425, 568482643, 2218467, 335754661, 1310261, -157142369, -613238, 347191365, 1354892, -643961400, -2513018, 22883400, 89301, -36345249, -141835, -768294260, -2998219, 0, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, 3334383, -631001801, -2462444, -43482586, -169688, 144935890, 565603, 3181859, 12417, -321386456, -1254190, -677264190, -2642980, -818892658, -3195676, 983611064, 3838479, -317727459, -1239911, -588375860, -2296099, -960233614, -3747250, 0, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, 2962264, -294395108, -1148858, -123678909, -482649, -391567239, -1528066, 814992530, 3180456, 68791907, 268456, 925511710, 3611750, -611800717, -2387513, 442566669, 1727088, -561940831, -2192938, 454226054, 1772588, 1062481036, 4146264, 0, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, -4158088, 284313712, 1109516, 764594519, 2983781, -720393920, -2811291, 629190881, 2455377, 64176841, 250446, -162963861, -635956, 909946047, 3551006, 965793731, 3768948, -686309310, -2678278, 873958779, 3410568, 431820817, 1685153, 0, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, 3815725, -496502727, -1937570, -519705671, -2028118, -642926661, -2508980, 963363710, 3759465, 1013967746, 3956944, -409185979, -1596822, 507246529, 1979497, 628875181, 2454145, -258649997, -1009365, -210776307, -822541, 7126831, 27812, 0, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 274060, 799869667, 3121440, 825844983, 3222807, -1071989969, -4183372, 952468207, 3716946, 841760171, 3284915, 588452222, 2296397, 1013916752, 3956745, 1016110510, 3965306, -163212680, -636927, -22347069, -87208, -302950022, -1182243, 0, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, -3852015, 675340520, 2635473, -327391679, -1277625, -787459213, -3073009, -710479343, -2772600, 15156688, 59148, 456183549, 1780227, -681730119, -2660408, 373072124, 1455890, 681503850, 2659525, 495951789, 1935420, -449207, -1753, 0, 0 -}; - #endif diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/packing.c b/Modules/PQClean/crypto_sign/dilithium3/aarch64/packing.c index 8fa3b0ccb..779976ec4 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/packing.c +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/packing.c @@ -19,7 +19,7 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1) { unsigned int i; @@ -45,7 +45,7 @@ void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], **************************************************/ void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) { + const uint8_t pk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -71,7 +71,7 @@ void unpack_pk(uint8_t rho[SEEDBYTES], * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 **************************************************/ -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -129,7 +129,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]) { + const uint8_t sk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -172,7 +172,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h **************************************************/ -void pack_sig(uint8_t sig[CRYPTO_BYTES], +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h) { @@ -221,7 +221,7 @@ void pack_sig(uint8_t sig[CRYPTO_BYTES], int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, - const uint8_t sig[CRYPTO_BYTES]) { + const uint8_t sig[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES]) { unsigned int i, j, k; for (i = 0; i < CTILDEBYTES; ++i) { diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/packing.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/packing.h index fb70ce5db..de6083ce2 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/packing.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/packing.h @@ -1,5 +1,5 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_PACKING_H +#define PQCLEAN_DILITHIUM3_AARCH64_PACKING_H /* * This file is dual licensed @@ -7,15 +7,16 @@ * or public domain at https://github.com/pq-crystals/dilithium */ +#include "api.h" #include "params.h" #include "polyvec.h" #include #define pack_pk DILITHIUM_NAMESPACE(pack_pk) -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); #define pack_sk DILITHIUM_NAMESPACE(pack_sk) -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -24,10 +25,10 @@ void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], const polyveck *s2); #define pack_sig DILITHIUM_NAMESPACE(pack_sig) -void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); #define unpack_pk DILITHIUM_NAMESPACE(unpack_pk) -void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]); +void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES]); #define unpack_sk DILITHIUM_NAMESPACE(unpack_sk) void unpack_sk(uint8_t rho[SEEDBYTES], @@ -36,9 +37,9 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + const uint8_t sk[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_SECRETKEYBYTES]); #define unpack_sig DILITHIUM_NAMESPACE(unpack_sig) -int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); +int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES]); #endif diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/params.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/params.h index 922c44d0f..b5fe91d24 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/params.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_PARAMS_H +#define PQCLEAN_DILITHIUM3_AARCH64_PARAMS_H /* * This file is dual licensed @@ -9,10 +9,10 @@ // #define DILITHIUM_MODE 2 #define DILITHIUM_MODE 3 -//#define DILITHIUM_MODE 5 +// #define DILITHIUM_MODE 5 -#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM3_AARCH64_crypto_sign #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM3_AARCH64_##s +#define CRYPTO_NAMESPACETOP crypto_sign #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s) @@ -40,18 +40,20 @@ #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) +// GAMMA1 == (1 << 19) #define POLYZ_PACKEDBYTES 640 +// GAMMA2 == (DILITHIUM_Q-1)/32 #define POLYW1_PACKEDBYTES 128 #define POLYETA_PACKEDBYTES 128 -#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) -#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ - + TRBYTES \ - + L*POLYETA_PACKEDBYTES \ - + K*POLYETA_PACKEDBYTES \ - + K*POLYT0_PACKEDBYTES) -#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) +#define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ + + TRBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define DILITHIUM_CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/poly.c b/Modules/PQClean/crypto_sign/dilithium3/aarch64/poly.c index 1832b641b..554c50629 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/poly.c +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/poly.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -35,16 +37,9 @@ #include "reduce.h" #include "rounding.h" #include "symmetric.h" -#include - -#include "fips202x2.h" - -#include "NTT_params.h" +#include "keccak2x/fips202x2.h" #include "ntt.h" - -static const int32_t montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; +#include #define DBENCH_START() #define DBENCH_STOP(t) @@ -57,11 +52,11 @@ static const int32_t montgomery_const[4] = { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce(int32_t *, const int32_t *); void poly_reduce(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_reduce(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -74,11 +69,11 @@ void poly_reduce(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq(int32_t *, const int32_t *); void poly_caddq(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_caddq(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -91,11 +86,11 @@ void poly_caddq(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze(int32_t *, const int32_t *); void poly_freeze(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_freeze(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -205,11 +200,11 @@ void poly_invntt_tomont(poly *a) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, constants); DBENCH_STOP(*tmul); } @@ -226,11 +221,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { * - poly *a0: pointer to output polynomial with coefficients c0 * - const poly *a: pointer to input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *); void poly_power2round(poly *a1, poly *a0, const poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); + PQCLEAN_DILITHIUM3_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); DBENCH_STOP(*tround); } @@ -706,11 +701,11 @@ void polyt1_pack(uint8_t *r, const poly *a) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32(int32_t *, const uint8_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32(int32_t *, const uint8_t *); void polyt1_unpack(poly *r, const uint8_t *a) { DBENCH_START(); - PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32(r->coeffs, a); + PQCLEAN_DILITHIUM3_AARCH64__asm_10_to_32(r->coeffs, a); DBENCH_STOP(*tpack); } diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/poly.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/poly.h index c253ecf69..cad1723e4 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/poly.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_POLY_H +#define PQCLEAN_DILITHIUM3_AARCH64_POLY_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/polyvec.c b/Modules/PQClean/crypto_sign/dilithium3/aarch64/polyvec.c index d8d9d2b98..1a5d6aa92 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/polyvec.c +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/polyvec.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,13 +35,9 @@ #include "params.h" #include "poly.h" #include "polyvec.h" -#include - +#include "ntt.h" #include "reduce.h" - -static const int32_t l_montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; +#include /************************************************* * Name: expand_mat @@ -177,11 +175,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -extern void PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { - PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const); + PQCLEAN_DILITHIUM3_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, constants); } /************************************************* diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/polyvec.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/polyvec.h index dc3377c93..ad4e36ab0 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/polyvec.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_POLYVEC_H +#define PQCLEAN_DILITHIUM3_AARCH64_POLYVEC_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/reduce.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/reduce.h index 9042e6cb0..1abb92a4f 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/reduce.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_REDUCE_H +#define PQCLEAN_DILITHIUM3_AARCH64_REDUCE_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/rounding.c b/Modules/PQClean/crypto_sign/dilithium3/aarch64/rounding.c index 871c97595..c01432776 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/rounding.c +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/rounding.c @@ -95,7 +95,8 @@ int32_t use_hint(int32_t a, unsigned int hint) { if (a0 > 0) { return (a1 + 1) & 15; + } else { + return (a1 - 1) & 15; } - return (a1 - 1) & 15; } diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/rounding.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/rounding.h index 36167d2af..f142c0e35 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/rounding.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_ROUNDING_H +#define PQCLEAN_DILITHIUM3_AARCH64_ROUNDING_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/sign.c b/Modules/PQClean/crypto_sign/dilithium3/aarch64/sign.c index 3565b3704..86c958b44 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/sign.c +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/sign.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -90,7 +91,7 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { pack_pk(pk, rho, &t1); /* Compute H(rho, t1) and write secret key */ - shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(tr, TRBYTES, pk, PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES); pack_sk(sk, rho, tr, key, &t0, &s1, &s2); return 0; @@ -210,7 +211,7 @@ int crypto_sign_signature(uint8_t *sig, /* Write signature */ pack_sig(sig, sig, &z, &h); - *siglen = CRYPTO_BYTES; + *siglen = PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES; return 0; } @@ -238,9 +239,9 @@ int crypto_sign(uint8_t *sm, size_t i; for (i = 0; i < mlen; ++i) { - sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + sm[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; } - crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); + crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES, mlen, sk); *smlen += mlen; return 0; } @@ -274,7 +275,7 @@ int crypto_sign_verify(const uint8_t *sig, polyveck t1, w1, h; shake256incctx state; - if (siglen != CRYPTO_BYTES) { + if (siglen != PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES) { return -1; } @@ -287,7 +288,7 @@ int crypto_sign_verify(const uint8_t *sig, } /* Compute CRH(H(rho, t1), msg) */ - shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(mu, CRHBYTES, pk, PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_PUBLICKEYBYTES); shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -353,17 +354,17 @@ int crypto_sign_open(uint8_t *m, const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; - if (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES; + if (crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] = sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM3_AARCH64_CRYPTO_BYTES + i]; } return 0; } diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/sign.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/sign.h index bc8c42658..0759909cc 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/sign.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/sign.h @@ -1,5 +1,5 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_SIGN_H +#define PQCLEAN_DILITHIUM3_AARCH64_SIGN_H /* * This file is dual licensed @@ -24,7 +24,7 @@ int crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); -#define crypto_sign DILITHIUM_NAMESPACETOP +#define crypto_sign DILITHIUM_NAMESPACE(crypto_sign) int crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk); diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/symmetric-shake.c b/Modules/PQClean/crypto_sign/dilithium3/aarch64/symmetric-shake.c index a53074aac..53aab1c94 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/symmetric-shake.c +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/symmetric-shake.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_sign/dilithium3/aarch64/symmetric.h b/Modules/PQClean/crypto_sign/dilithium3/aarch64/symmetric.h index 40b928ec6..d9551aba1 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/aarch64/symmetric.h +++ b/Modules/PQClean/crypto_sign/dilithium3/aarch64/symmetric.h @@ -1,13 +1,14 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM3_AARCH64_SYMMETRIC_H +#define PQCLEAN_DILITHIUM3_AARCH64_SYMMETRIC_H /* * This file was originally licensed * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -33,7 +34,7 @@ */ #include "fips202.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "params.h" #include diff --git a/Modules/PQClean/crypto_sign/dilithium3/clean/sign.c b/Modules/PQClean/crypto_sign/dilithium3/clean/sign.c index 7a4f3d315..0f13be413 100644 --- a/Modules/PQClean/crypto_sign/dilithium3/clean/sign.c +++ b/Modules/PQClean/crypto_sign/dilithium3/clean/sign.c @@ -337,7 +337,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, badsig: /* Signature verification failed */ - *mlen = -1; + *mlen = (size_t) -1; for (i = 0; i < smlen; ++i) { m[i] = 0; } diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/LICENSE b/Modules/PQClean/crypto_sign/dilithium5/aarch64/LICENSE index 0e259d42c..093b0a7db 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/LICENSE +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/LICENSE @@ -1,121 +1,6 @@ -Creative Commons Legal Code -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. - -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. +All the files in this repository are covered by a variety of licenses. +In principle, we offer two choices of licensing: +MIT (https://opensource.org/license/mit/) or CC0 1.0 Universal (https://creativecommons.org/publicdomain/zero/1.0/legalcode.en). +You can find the detailed licensing information in the beginning of each files. +If nothing is stated in the beginning of a file, the file is covered by CC0 1.0 Universal. diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/Makefile b/Modules/PQClean/crypto_sign/dilithium5/aarch64/Makefile index 8a156266d..a52fece1a 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/Makefile +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/Makefile @@ -1,12 +1,16 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium5_aarch64.a -HEADERS=api.h fips202x2.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h -OBJECTS=fips202x2.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o feat.o +HEADERS=api.h macros_common.inc macros.inc NTT_params.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS= ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o __asm_iNTT.o __asm_NTT.o __asm_poly.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) -g +KECCAK2XDIR=../../../common/keccak2x +KECCAK2XOBJ=fips202x2.o feat.o +KECCAK2X=$(addprefix $(KECCAK2XDIR)/,$(KECCAK2XOBJ)) + all: $(LIB) %.o: %.c $(HEADERS) @@ -15,8 +19,11 @@ all: $(LIB) %.o: %.S $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -$(LIB): $(OBJECTS) $(HEADERS) - $(AR) -r $@ $(OBJECTS) +$(LIB): $(OBJECTS) $(KECCAK2X) + $(AR) -r $@ $(OBJECTS) $(KECCAK2X) + +$(KECCAK2X): + $(MAKE) -C $(KECCAK2XDIR) CFLAGS="$(CFLAGS)" $(KECCAK2XOBJ) clean: $(RM) $(OBJECTS) diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/NTT_params.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/NTT_params.h index 582c16ed5..b087f7814 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/NTT_params.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/NTT_params.h @@ -1,8 +1,10 @@ -#ifndef NTT_PARAMS_H -#define NTT_PARAMS_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_NTT_PARAMS_H +#define PQCLEAN_DILITHIUM5_AARCH64_NTT_PARAMS_H /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_NTT.S b/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_NTT.S index c1d25f64b..be5a97f57 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_NTT.S +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_NTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,261 +30,413 @@ #include "macros.inc" -.align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top -PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top: -_PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top: - - push_all - Q .req w20 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 - counter .req x19 - - ldr Q, [x2] - - mov table, x1 - - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 - - add src11, src0, #704 - add src12, src0, #768 +#include "params.h" - add src13, src0, #832 - add src14, src0, #896 +.align 2 +.global PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top +PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top: +_PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top: - add src15, src0, #960 + push_simd + Q .req w8 + src .req x0 + counter .req x11 - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x1], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [x1], #64 + ldr Q, [x2] mov v20.S[0], Q - ld1 { v1.4S}, [ src1] - ld1 { v3.4S}, [ src3] - ld1 { v5.4S}, [ src5] - ld1 { v7.4S}, [ src7] - ld1 { v9.4S}, [ src9] - ld1 {v11.4S}, [src11] - ld1 {v13.4S}, [src13] - ld1 {v15.4S}, [src15] - - ld1 { v0.4S}, [ src0] - ld1 { v2.4S}, [ src2] - ld1 { v4.4S}, [ src4] - ld1 { v6.4S}, [ src6] - ld1 { v8.4S}, [ src8] - ld1 {v10.4S}, [src10] - ld1 {v12.4S}, [src12] - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + ldr q9, [src, #9*64] + ldr q11, [src, #11*64] + ldr q13, [src, #13*64] + ldr q15, [src, #15*64] + + qq_butterfly_topl \ + v9, v11, v13, v15, v16, v17, v18, v19, v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64 + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) mov counter, #3 _ntt_top_loop: - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - - qq_butterfly_top v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 - qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 - qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mixssl \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + v9, v11, v13, v15, v16, v17, v18, v19, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64, \ + src, \ + q1, q3, q5, q7, \ + #(16+1*64), #(16+3*64), #(16+5*64), #(16+7*64) + + qq_butterfly_mixll \ + v1, v3, v5, v7, v9, v11, v13, v15, v16, v17, v18, v19, \ + v8, v10, v12, v14, v28, v29, v30, v31, \ + v20, \ + v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, \ + src, \ + q8, q10, q12, q14, \ + #(16+8*64), #(16+10*64), #(16+12*64), #(16+14*64), \ + src, \ + q0, q2, q4, q6, \ + #(16+0*64), #(16+2*64), #(16+4*64), #(16+6*64) + + add src, src, #16 + + qq_butterfly_mix v0, v2, v4, v6, v8, v10, v12, v14, v28, v29, v30, v31, v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v1, v3, v9, v11, v5, v7, v13, v15, v16, v17, v18, v19, v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix v0, v2, v8, v10, v4, v6, v12, v14, v28, v29, v30, v31, v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v1, v5, v9, v13, v3, v7, v11, v15, v16, v17, v18, v19, v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mix v0, v4, v8, v12, v2, v6, v10, v14, v28, v29, v30, v31, v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + qq_butterfly_mixssl \ + v8, v10, v12, v14, v9, v11, v13, v15, v16, v17, v18, v19, \ + v1, v3, v5, v7, v28, v29, v30, v31, \ + v20, \ + v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, \ + src, \ + q9, q11, q13, q15, \ + #9*64, #11*64, #13*64, #15*64, \ + src, \ + q8, q10, q12, q14, \ + #8*64, #10*64, #12*64, #14*64, \ + src, \ + q9, q11, q13, q15, \ + #(16+9*64), #(16+11*64), #(16+13*64), #(16+15*64) sub counter, counter, #1 cbnz counter, _ntt_top_loop - st1 { v1.4S}, [ src1], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v7.4S}, [ src7], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v11.4S}, [src11], #16 - st1 {v13.4S}, [src13], #16 - st1 {v15.4S}, [src15], #16 - - st1 { v0.4S}, [ src0], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v8.4S}, [ src8], #16 - st1 {v10.4S}, [src10], #16 - st1 {v12.4S}, [src12], #16 - st1 {v14.4S}, [src14], #16 + qq_butterfly_botss \ + v0, v2, v4, v6, v1, v3, v5, v7, v28, v29, v30, v31, \ + src, \ + q1, q3, q5, q7, \ + #1*64, #3*64, #5*64, #7*64, \ + src, \ + q0, q2, q4, q6, \ + #0*64, #2*64, #4*64, #6*64 .unreq Q - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter - pop_all + pop_simd - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot -PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot: -_PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot: - - push_all - Q .req w20 - src0 .req x0 - des0 .req x1 - src1 .req x2 - des1 .req x3 - table0 .req x28 - table1 .req x27 - counter .req x19 +.global PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot +PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot: +_PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot: + + push_simd + Q .req w8 + src .req x0 + table0 .req x9 + table1 .req x10 + counter .req x11 ldr Q, [x2] add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + ldr q0, [src, #0*16] + ldr q1, [src, #1*16] + ldr q2, [src, #2*16] + ldr q3, [src, #3*16] + + ldr q4, [table0, #0*16] + ldr q5, [table0, #1*16] + ldr q20, [table1, #0*16] + ldr q21, [table1, #1*16] + + dq_butterfly_topl4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 + + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + - add des0, src0, #0 - add des1, src0, #512 + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - mov counter, #8 + str q0, [src, #0*16] + str q2, [src, #2*16] + + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 + + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + mov counter, #3 _ntt_bot_loop: - ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld1 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + trn_4x4_l4 v12, v13, v14, v15, v8, v9, v10, v11, src, q0, q1, q2, q3, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + str q12, [src, #0*16] + str q13, [src, #1*16] - mov v4.S[0], Q + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - dq_butterfly_top v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 - dq_butterfly_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + str q14, [src, #2*16] + str q15, [src, #3*16] + + + add src, src, #64 + + trn_4x4_l4 v28, v29, v30, v31, v24, v25, v26, v27, src, q16, q17, q18, q19, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v0, v1, v2, v3, v12, v13, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q28, q29, q30, q31, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 dq_butterfly_bot v16, v18, v17, v19, v28, v29, v4, v21, 0, 1, v21, 2, 3 + add table0, table0, #128 + add table1, table1, #128 + trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 - dq_butterfly_vec_top v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7 - dq_butterfly_vec_mixed v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_mixed v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 - dq_butterfly_vec_mixed v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_top_trn_4x4 \ + v0, v1, v2, v3, v12, v13, v4, v6, v7, v6, v7, \ + v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_vec_mix v0, v1, v2, v3, v12, v13, v16, v17, v18, v19, v28, v29, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v16, v17, v18, v19, v28, v29, v0, v2, v1, v3, v12, v13, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v0, v2, v1, v3, v12, v13, v16, v18, v17, v19, v28, v29, v4, v8, v9, v10, v11, v24, v25, v26, v27 + + + trn_4x4_l4 v0, v1, v2, v3, v8, v9, v10, v11, src, q12, q13, q14, q15, #(64+0*16), #(64+1*16), #(64+2*16), #(64+3*16) + + str q0, [src, #0*16] + str q2, [src, #2*16] + dq_butterfly_vec_bot v16, v18, v17, v19, v28, v29, v4, v24, v25, v26, v27 - st4 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st4 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + str q1, [src, #1*16] + str q3, [src, #3*16] + + add src, src, #64 + + trn_4x4_l4 v16, v17, v18, v19, v24, v25, v26, v27, src, q28, q29, q30, q31, #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + sub src, src, #64 + + dq_butterfly_top2l4s4 \ + v12, v13, v14, v15, v0, v1, v4, v4, 2, 3, v4, 2, 3, \ + table0, q4, q5, #0*16, #1*16, \ + table1, q20, q21, #0*16, #1*16, \ + src, \ + q16, q17, q18, q19, \ + #(512+0*16), #(512+1*16), #(512+2*16), #(512+3*16) + + add src, src, #64 + + dq_butterfly_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + + dq_butterfly_mixl6 \ + v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, \ + v4, \ + v20, 2, 3, v20, 2, 3, v5, 0, 1, v5, 2, 3, \ + table0, \ + q6, q7, q8, q9, q10, q11, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_mixl6 \ + v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, \ + v4, \ + v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3, \ + table1, \ + q22, q23, q24, q25, q26, q27, \ + #2*16, #3*16, #4*16, #5*16, #6*16, #7*16 + + dq_butterfly_bot v28, v30, v29, v31, v16, v17, v4, v21, 0, 1, v21, 2, 3 + + add table0, table0, #128 + add table1, table1, #128 + + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + + dq_butterfly_vec_top_trn_4x4 \ + v12, v13, v14, v15, v0, v1, v4, v6, v7, v6, v7, \ + v28, v29, v30, v31, v16, v17, v18, v19 + + dq_butterfly_vec_mix v12, v13, v14, v15, v0, v1, v28, v29, v30, v31, v16, v17, v4, v6, v7, v6, v7, v22, v23, v22, v23 + dq_butterfly_vec_mix v28, v29, v30, v31, v16, v17, v12, v14, v13, v15, v0, v1, v4, v22, v23, v22, v23, v8, v9, v10, v11 + dq_butterfly_vec_mix v12, v14, v13, v15, v0, v1, v28, v30, v29, v31, v16, v17, v4, v8, v9, v10, v11, v24, v25, v26, v27 sub counter, counter, #1 cbnz counter, _ntt_bot_loop - .unreq Q - .unreq src0 - .unreq des0 - .unreq src1 - .unreq des1 - .unreq table0 - .unreq table1 - .unreq counter - pop_all + dq_butterfly_vec_bot v28, v30, v29, v31, v16, v17, v4, v24, v25, v26, v27 - br lr + trn_4x4 v12, v13, v14, v15, v0, v1, v2, v3 + trn_4x4_s4 v28, v29, v30, v31, v16, v17, v18, v19, src, q12, q13, q14, q15, #0*16, #1*16, #2*16, #3*16 + str q28, [src, #(512+0*16)] + str q29, [src, #(512+1*16)] + str q30, [src, #(512+2*16)] + str q31, [src, #(512+3*16)] + add src, src, #64 + .unreq Q + .unreq src + .unreq table0 + .unreq table1 + .unreq counter + pop_simd + ret diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_iNTT.S b/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_iNTT.S index a8191f5cf..559d442af 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_iNTT.S +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_iNTT.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,10 +31,10 @@ #include "macros.inc" .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top -PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: -_PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top +PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top: +_PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top: push_all Q .req w20 @@ -41,23 +44,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: invNR2dp .req w25 invNWR2ph .req w26 invNWR2dp .req w27 - src0 .req x0 - src1 .req x1 - src2 .req x2 - src3 .req x3 - src4 .req x4 - src5 .req x5 - src6 .req x6 - src7 .req x7 - src8 .req x8 - src9 .req x9 - src10 .req x10 - src11 .req x11 - src12 .req x12 - src13 .req x13 - src14 .req x14 - src15 .req x15 - table .req x28 + src .req x0 counter .req x19 ldr Q, [x2, #0] @@ -69,77 +56,63 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: ldr invNWR2ph, [x2, #24] ldr invNWR2dp, [x2, #28] - mov table, x1 + ldr q20, [x1, #0*16] + ldr q21, [x1, #1*16] + ldr q22, [x1, #2*16] + ldr q23, [x1, #3*16] + ldr q24, [x1, #4*16] + ldr q25, [x1, #5*16] + ldr q26, [x1, #6*16] + ldr q27, [x1, #7*16] - add src1, src0, #64 - add src2, src0, #128 - - add src3, src0, #192 - add src4, src0, #256 - - add src5, src0, #320 - add src6, src0, #384 - - add src7, src0, #448 - add src8, src0, #512 - - add src9, src0, #576 - add src10, src0, #640 + mov v20.S[0], Q - add src11, src0, #704 - add src12, src0, #768 + ldr q0, [src, # 0*64] + ldr q1, [src, # 1*64] - add src13, src0, #832 - add src14, src0, #896 + ldr q2, [src, # 2*64] + ldr q3, [src, # 3*64] - add src15, src0, #960 + ldr q4, [src, # 4*64] + ldr q5, [src, # 5*64] - ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 - ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table], #64 + ldr q6, [src, # 6*64] + ldr q7, [src, # 7*64] - mov v20.S[0], Q + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 - ld1 { v0.4S}, [ src0] - ld1 { v1.4S}, [ src1] - ld1 { v2.4S}, [ src2] - ld1 { v3.4S}, [ src3] - ld1 { v4.4S}, [ src4] - ld1 { v5.4S}, [ src5] - ld1 { v6.4S}, [ src6] - ld1 { v7.4S}, [ src7] - - ld1 { v8.4S}, [ src8] - ld1 { v9.4S}, [ src9] - ld1 {v10.4S}, [src10] - ld1 {v11.4S}, [src11] - ld1 {v12.4S}, [src12] - ld1 {v13.4S}, [src13] - ld1 {v14.4S}, [src14] - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + mov counter, #3 + _intt_top_loop: + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -153,77 +126,99 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: sub v17.4S, v17.4S, v19.4S mla v0.4S, v16.4S, v29.4S - mla v1.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v2.4S + mla v1.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v3.4S + + str q0, [src, #0*64] cmge v16.4S, v2.4S, v30.4S + ldr q0, [src, #(16 + 0*64)] + str q1, [src, #1*64] cmge v17.4S, v3.4S, v30.4S + ldr q1, [src, #(16 + 1*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v2.4S, v16.4S, v29.4S - mla v3.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v4.4S + mla v3.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v5.4S + + str q2, [src, #2*64] cmge v16.4S, v4.4S, v30.4S + ldr q2, [src, #(16 + 2*64)] + str q3, [src, #3*64] cmge v17.4S, v5.4S, v30.4S + ldr q3, [src, #(16 + 3*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v4.4S, v16.4S, v29.4S - mla v5.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v6.4S + mla v5.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v7.4S + + str q4, [src, #4*64] cmge v16.4S, v6.4S, v30.4S + ldr q4, [src, #(16 + 4*64)] + str q5, [src, #5*64] cmge v17.4S, v7.4S, v30.4S + ldr q5, [src, #(16 + 5*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v6.4S, v16.4S, v29.4S - mla v7.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v8.4S + mla v7.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v9.4S + + str q6, [src, #6*64] cmge v16.4S, v8.4S, v30.4S + ldr q6, [src, #(16 + 6*64)] + str q7, [src, #7*64] cmge v17.4S, v9.4S, v30.4S + ldr q7, [src, #(16 + 7*64)] sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v8.4S, v16.4S, v29.4S - mla v9.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v10.4S + mla v9.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v11.4S + + str q8, [src, #8*64] cmge v16.4S, v10.4S, v30.4S + str q9, [src, #9*64] cmge v17.4S, v11.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v10.4S, v16.4S, v29.4S - mla v11.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v12.4S + mla v11.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v13.4S + + str q10, [src, #10*64] cmge v16.4S, v12.4S, v30.4S + str q11, [src, #11*64] cmge v17.4S, v13.4S, v30.4S sub v16.4S, v16.4S, v18.4S sub v17.4S, v17.4S, v19.4S mla v12.4S, v16.4S, v29.4S - mla v13.4S, v17.4S, v29.4S - cmge v18.4S, v31.4S, v14.4S + mla v13.4S, v17.4S, v29.4S cmge v19.4S, v31.4S, v15.4S + + str q12, [src, #12*64] cmge v16.4S, v14.4S, v30.4S + str q13, [src, #13*64] cmge v17.4S, v15.4S, v30.4S sub v16.4S, v16.4S, v18.4S @@ -232,66 +227,45 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - mov counter, #3 - _intt_top_loop: - - st1 { v0.4S}, [ src0], #16 - ld1 { v0.4S}, [ src0] - st1 { v1.4S}, [ src1], #16 - ld1 { v1.4S}, [ src1] - st1 { v2.4S}, [ src2], #16 - ld1 { v2.4S}, [ src2] - st1 { v3.4S}, [ src3], #16 - ld1 { v3.4S}, [ src3] - st1 { v4.4S}, [ src4], #16 - ld1 { v4.4S}, [ src4] - st1 { v5.4S}, [ src5], #16 - ld1 { v5.4S}, [ src5] - st1 { v6.4S}, [ src6], #16 - ld1 { v6.4S}, [ src6] - st1 { v7.4S}, [ src7], #16 - ld1 { v7.4S}, [ src7] - - st1 { v8.4S}, [ src8], #16 - ld1 { v8.4S}, [ src8] - st1 { v9.4S}, [ src9], #16 - ld1 { v9.4S}, [ src9] - st1 {v10.4S}, [src10], #16 - ld1 {v10.4S}, [src10] - st1 {v11.4S}, [src11], #16 - ld1 {v11.4S}, [src11] - st1 {v12.4S}, [src12], #16 - ld1 {v12.4S}, [src12] - st1 {v13.4S}, [src13], #16 - ld1 {v13.4S}, [src13] - st1 {v14.4S}, [src14], #16 - ld1 {v14.4S}, [src14] - st1 {v15.4S}, [src15], #16 - ld1 {v15.4S}, [src15] - - qq_butterfly_bot v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3 - qq_butterfly_mixed_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 - qq_butterfly_mixed_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 - qq_butterfly_mixed_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 - qq_butterfly_mixed_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 - qq_butterfly_mixed_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 + + qq_butterfly_botll \ + v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, \ + src, \ + q8, q9, q10, q11, \ + #8*64, #9*64, #10*64, #11*64, \ + src, \ + q12, q13, q14, q15, \ + #12*64, #13*64, #14*64, #15*64 + + qq_butterfly_mix_rev v0, v2, v4, v6, v16, v17, v18, v19, v1, v3, v5, v7, v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v20, v24, 0, 1, v24, 2, 3, v25, 0, 1, v25, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_mix_rev v8, v10, v12, v14, v28, v29, v30, v31, v9, v11, v13, v15, v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3 + qq_butterfly_mix_rev v0, v1, v4, v5, v16, v17, v18, v19, v2, v3, v6, v7, v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v20, v22, 0, 1, v22, 0, 1, v22, 2, 3, v22, 2, 3, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3 + qq_butterfly_mix_rev v8, v9, v12, v13, v28, v29, v30, v31, v10, v11, v14, v15, v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v20, v23, 0, 1, v23, 0, 1, v23, 2, 3, v23, 2, 3, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1 + qq_butterfly_mix_rev v0, v1, v2, v3, v16, v17, v18, v19, v4, v5, v6, v7, v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 qq_butterfly_top v8, v9, v10, v11, v28, v29, v30, v31, v12, v13, v14, v15, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 - mov v20.S[2], invNWR2ph - mov v20.S[3], invNWR2dp - qq_sub_add v16, v17, v18, v19, v28, v29, v30, v31, v0, v2, v4, v6, v8, v10, v12, v14 qq_sub_add v0, v2, v4, v6, v8, v10, v12, v14, v1, v3, v5, v7, v9, v11, v13, v15 - qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 - mov v20.S[2], invNR2ph mov v20.S[3], invNR2dp qq_montgomery_mul v1, v3, v5, v7, v0, v2, v4, v6, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 qq_montgomery_mul v0, v2, v4, v6, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + mov v20.S[2], invNWR2ph + mov v20.S[3], invNWR2dp + + qq_montgomery_mul v9, v11, v13, v15, v8, v10, v12, v14, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_montgomery_mul v8, v10, v12, v14, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + dup v29.4S, Q dup v30.4S, Qhalf dup v31.4S, nQhalf @@ -307,6 +281,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v0.4S, v16.4S, v29.4S mla v1.4S, v17.4S, v29.4S + str q0, [src, #0*64] + str q1, [src, #1*64] + cmge v18.4S, v31.4S, v2.4S cmge v19.4S, v31.4S, v3.4S cmge v16.4S, v2.4S, v30.4S @@ -318,6 +295,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v2.4S, v16.4S, v29.4S mla v3.4S, v17.4S, v29.4S + str q2, [src, #2*64] + str q3, [src, #3*64] + cmge v18.4S, v31.4S, v4.4S cmge v19.4S, v31.4S, v5.4S cmge v16.4S, v4.4S, v30.4S @@ -329,6 +309,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v4.4S, v16.4S, v29.4S mla v5.4S, v17.4S, v29.4S + str q4, [src, #4*64] + str q5, [src, #5*64] + cmge v18.4S, v31.4S, v6.4S cmge v19.4S, v31.4S, v7.4S cmge v16.4S, v6.4S, v30.4S @@ -340,6 +323,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v6.4S, v16.4S, v29.4S mla v7.4S, v17.4S, v29.4S + str q6, [src, #6*64] + str q7, [src, #7*64] + cmge v18.4S, v31.4S, v8.4S cmge v19.4S, v31.4S, v9.4S cmge v16.4S, v8.4S, v30.4S @@ -351,6 +337,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v8.4S, v16.4S, v29.4S mla v9.4S, v17.4S, v29.4S + str q8, [src, #8*64] + str q9, [src, #9*64] + cmge v18.4S, v31.4S, v10.4S cmge v19.4S, v31.4S, v11.4S cmge v16.4S, v10.4S, v30.4S @@ -362,6 +351,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v10.4S, v16.4S, v29.4S mla v11.4S, v17.4S, v29.4S + str q10, [src, #10*64] + str q11, [src, #11*64] + cmge v18.4S, v31.4S, v12.4S cmge v19.4S, v31.4S, v13.4S cmge v16.4S, v12.4S, v30.4S @@ -373,6 +365,9 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v12.4S, v16.4S, v29.4S mla v13.4S, v17.4S, v29.4S + str q12, [src, #12*64] + str q13, [src, #13*64] + cmge v18.4S, v31.4S, v14.4S cmge v19.4S, v31.4S, v15.4S cmge v16.4S, v14.4S, v30.4S @@ -384,26 +379,11 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: mla v14.4S, v16.4S, v29.4S mla v15.4S, v17.4S, v29.4S - sub counter, counter, #1 - cbnz counter, _intt_top_loop + str q14, [src, #14*64] + str q15, [src, #15*64] + + add src, src, #16 - st1 { v0.4S}, [ src0], #16 - st1 { v1.4S}, [ src1], #16 - st1 { v2.4S}, [ src2], #16 - st1 { v3.4S}, [ src3], #16 - st1 { v4.4S}, [ src4], #16 - st1 { v5.4S}, [ src5], #16 - st1 { v6.4S}, [ src6], #16 - st1 { v7.4S}, [ src7], #16 - - st1 { v8.4S}, [ src8], #16 - st1 { v9.4S}, [ src9], #16 - st1 {v10.4S}, [src10], #16 - st1 {v11.4S}, [src11], #16 - st1 {v12.4S}, [src12], #16 - st1 {v13.4S}, [src13], #16 - st1 {v14.4S}, [src14], #16 - st1 {v15.4S}, [src15], #16 .unreq Q .unreq Qhalf @@ -412,41 +392,23 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top: .unreq invNR2dp .unreq invNWR2ph .unreq invNWR2dp - .unreq src0 - .unreq src1 - .unreq src2 - .unreq src3 - .unreq src4 - .unreq src5 - .unreq src6 - .unreq src7 - .unreq src8 - .unreq src9 - .unreq src10 - .unreq src11 - .unreq src12 - .unreq src13 - .unreq src14 - .unreq src15 - .unreq table + .unreq src .unreq counter pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot -PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot: -_PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot +PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot: +_PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot: push_all Q .req w20 RphRdp .req x21 src0 .req x0 - des0 .req x1 src1 .req x2 - des1 .req x3 table0 .req x28 table1 .req x27 counter .req x19 @@ -457,72 +419,175 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot: add table0, x1, #128 add table1, table0, #1024 - add src1, src0, #512 + add src1, src0, #512 - add des0, src0, #0 - add des1, src0, #512 + ldr q8, [table0, #4*16] + ldr q9, [table0, #5*16] + ldr q10, [table0, #6*16] + ldr q11, [table0, #7*16] - mov counter, #8 - _intt_bot_loop: + ldr q24, [table1, #4*16] + ldr q25, [table1, #5*16] + ldr q26, [table1, #6*16] + ldr q27, [table1, #7*16] + + ldr q0, [src0, # 0*16] + ldr q1, [src0, # 1*16] + + ldr q16, [src1, # 0*16] + ldr q17, [src1, # 1*16] - ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 - ld4 { v16.4S, v17.4S, v18.4S, v19.4S}, [src1], #64 + ldr q2, [src0, # 2*16] + ldr q3, [src0, # 3*16] - ld1 { v4.4S, v5.4S}, [table0], #32 - ld2 { v6.4S, v7.4S}, [table0], #32 - ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [table0], #64 - ld1 { v20.4S, v21.4S}, [table1], #32 - ld2 { v22.4S, v23.4S}, [table1], #32 - ld4 { v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + ldr q18, [src1, # 2*16] + ldr q19, [src1, # 3*16] + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 mov v4.S[0], Q mov v20.D[0], RphRdp dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 - dq_butterfly_vec_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 - dq_butterfly_vec_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 - dq_butterfly_vec_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - dq_butterfly_vec_top v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 - trn_4x4 v0, v1, v2, v3, v12, v13, v14, v15 - trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + mov counter, #7 + _intt_bot_loop: + + dq_butterfly_vec_top_ltrn_4x4 \ + v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + table0, \ + q8, q9, q10, q11, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16), \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q24, q25, q26, q27, \ + #(128+4*16), #(128+5*16), #(128+6*16), #(128+7*16) dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 - dq_butterfly_mixed_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 - dq_butterfly_mixed_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 - dq_butterfly_mixed_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + str q2, [src0, # 2*16] srshr v14.4S, v0.4S, #23 + ldr q2, [src0, #(64+ 2*16)] + str q3, [src0, # 3*16] srshr v15.4S, v1.4S, #23 + ldr q3, [src0, #(64+ 3*16)] + str q18, [src1, # 2*16] srshr v30.4S, v16.4S, #23 + ldr q18, [src1, #(64+ 2*16)] + str q19, [src1, # 3*16] srshr v31.4S, v17.4S, #23 + ldr q19, [src1, #(64+ 3*16)] mls v0.4S, v14.4S, v4.S[0] + str q0, [src0, # 0*16] + ldr q0, [src0, #(64+ 0*16)] mls v1.4S, v15.4S, v4.S[0] + str q1, [src0, # 1*16] + ldr q1, [src0, #(64+ 1*16)] mls v16.4S, v30.4S, v4.S[0] + str q16, [src1, # 0*16] + ldr q16, [src1, #(64+ 0*16)] mls v17.4S, v31.4S, v4.S[0] + str q17, [src1, # 1*16] + ldr q17, [src1, #(64+ 1*16)] + + add table0, table0, #128 + add table1, table1, #128 - st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [des0], #64 - st1 { v16.4S, v17.4S, v18.4S, v19.4S}, [des1], #64 + add src0, src0, #64 + add src1, src1, #64 + + trn_4x4_l4 \ + v0, v1, v2, v3, v12, v13, v14, v15, \ + table0, \ + q4, q5, q6, q7, \ + #0*16, #1*16, #2*16, #3*16 + + trn_4x4_l4 \ + v16, v17, v18, v19, v28, v29, v30, v31, \ + table1, \ + q20, q21, q22, q23, \ + #0*16, #1*16, #2*16, #3*16 + + mov v4.S[0], Q + mov v20.D[0], RphRdp + + dq_butterfly_vec_bot v0, v2, v12, v13, v1, v3, v4, v8, v9, v10, v11 + dq_butterfly_vec_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v8, v9, v10, v11, v24, v25, v26, v27 + dq_butterfly_vec_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v24, v25, v26, v27, v6, v7, v6, v7 + dq_butterfly_vec_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v6, v7, v6, v7, v22, v23, v22, v23 sub counter, counter, #1 cbnz counter, _intt_bot_loop + dq_butterfly_vec_top_trn_4x4 \ + v16, v17, v28, v29, v18, v19, v4, v22, v23, v22, v23, \ + v0, v1, v2, v3, v12, v13, v14, v15 + + trn_4x4 v16, v17, v18, v19, v28, v29, v30, v31 + + dq_butterfly_bot v0, v2, v12, v13, v1, v3, v4, v5, 0, 1, v5, 2, 3 + dq_butterfly_mix_rev v0, v2, v12, v13, v1, v3, v16, v18, v28, v29, v17, v19, v4, v5, 0, 1, v5, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mix_rev v16, v18, v28, v29, v17, v19, v0, v1, v12, v13, v2, v3, v4, v21, 0, 1, v21, 2, 3, v4, 2, 3, v4, 2, 3 + dq_butterfly_mix_rev v0, v1, v12, v13, v2, v3, v16, v17, v28, v29, v18, v19, v4, v4, 2, 3, v4, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_top v16, v17, v28, v29, v18, v19, v4, v20, 2, 3, v20, 2, 3 + + str q2, [src0, # 2*16] + str q3, [src0, # 3*16] + str q18, [src1, # 2*16] + str q19, [src1, # 3*16] + + srshr v14.4S, v0.4S, #23 + srshr v15.4S, v1.4S, #23 + srshr v30.4S, v16.4S, #23 + srshr v31.4S, v17.4S, #23 + + mls v0.4S, v14.4S, v4.S[0] + mls v1.4S, v15.4S, v4.S[0] + mls v16.4S, v30.4S, v4.S[0] + mls v17.4S, v31.4S, v4.S[0] + + str q0, [src0, # 0*16] + str q1, [src0, # 1*16] + str q16, [src1, # 0*16] + str q17, [src1, # 1*16] + + add table0, table0, #128 + add table1, table1, #128 + + add src0, src0, #64 + add src1, src1, #64 + .unreq Q .unreq RphRdp .unreq src0 - .unreq des0 .unreq src1 - .unreq des1 .unreq table0 .unreq table1 .unreq counter pop_all - br lr - - + ret diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_poly.S b/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_poly.S index 49847b987..004d3ff32 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_poly.S +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/__asm_poly.S @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,10 +32,10 @@ #include "params.h" .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32 -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32 -PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32: -_PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32 +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32 +PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32: +_PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32: mov x7, #16 _10_to_32_loop: @@ -45,7 +48,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32: str w4, [x0], #4 ubfx w5, w2, #20, #10 str w5, [x0], #4 - lsr w6, w2, #30 + lsr w6, w2, #30 ldr w2, [x1], #4 @@ -99,13 +102,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32: sub x7, x7, #1 cbnz x7, _10_to_32_loop - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce: ldr w4, [x1] @@ -117,7 +120,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: ld1 { v1.4S}, [x1], #16 ld1 { v2.4S}, [x1], #16 ld1 { v3.4S}, [x1], #16 - + ld1 { v4.4S}, [x1], #16 srshr v16.4S, v0.4S, #23 ld1 { v5.4S}, [x1], #16 @@ -126,7 +129,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -135,7 +138,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -192,13 +195,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq: ldr w4, [x1] @@ -285,13 +288,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze: ldr w4, [x1] @@ -312,7 +315,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: srshr v18.4S, v2.4S, #23 ld1 { v7.4S}, [x1], #16 srshr v19.4S, v3.4S, #23 - + srshr v20.4S, v4.4S, #23 mls v0.4S, v16.4S, v24.4S srshr v21.4S, v5.4S, #23 @@ -321,7 +324,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S srshr v23.4S, v7.4S, #23 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S sshr v16.4S, v0.4S, #31 mls v5.4S, v21.4S, v24.4S @@ -330,7 +333,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: sshr v18.4S, v2.4S, #31 mls v7.4S, v23.4S, v24.4S sshr v19.4S, v3.4S, #31 - + sshr v20.4S, v4.4S, #31 mls v0.4S, v16.4S, v24.4S sshr v21.4S, v5.4S, #31 @@ -339,7 +342,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: mls v2.4S, v18.4S, v24.4S sshr v23.4S, v7.4S, #31 mls v3.4S, v19.4S, v24.4S - + mls v4.4S, v20.4S, v24.4S st1 { v0.4S}, [x0], #16 mls v5.4S, v21.4S, v24.4S @@ -414,13 +417,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze: st1 { v6.4S}, [x0], #16 st1 { v7.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round: mov w4, #1 @@ -560,13 +563,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round: st1 {v30.4S}, [x1], #16 st1 {v31.4S}, [x1], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_add: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -609,13 +612,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_add: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_sub: ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -632,7 +635,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub: mov x16, #15 _poly_sub_loop: - + st1 {v16.4S}, [x0], #16 ld1 {v0.4S}, [x1], #16 ld1 {v4.4S}, [x2], #16 @@ -658,13 +661,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_sub: st1 {v18.4S}, [x0], #16 st1 {v19.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_shiftl: add x1, x0, #0 @@ -725,13 +728,13 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_shiftl: st1 {v22.4S}, [x0], #16 st1 {v23.4S}, [x0], #16 - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery -PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: -_PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery +PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery: +_PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery: push_all @@ -769,14 +772,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -819,14 +822,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -843,14 +846,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery: pop_all - br lr + ret .align 2 -.global PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery -.global _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery -PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: -_PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: +.global PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery +.global _PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery +PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery: +_PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery: push_all @@ -910,90 +913,90 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1027,14 +1030,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v27.4S, v23.4S, v31.4S ld1 { v7.4S}, [x2], #16 - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1064,90 +1067,90 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x7], #16 - ld1 { v4.4S}, [ x8], #16 + ld1 { v0.4S}, [ x7], #16 + ld1 { v4.4S}, [ x8], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x7], #16 - ld1 { v5.4S}, [ x8], #16 + ld1 { v1.4S}, [ x7], #16 + ld1 { v5.4S}, [ x8], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x7], #16 - ld1 { v6.4S}, [ x8], #16 + ld1 { v2.4S}, [ x7], #16 + ld1 { v6.4S}, [ x8], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x7], #16 + ld1 { v3.4S}, [ x7], #16 ld1 { v7.4S}, [ x8], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [ x9], #16 - ld1 { v4.4S}, [x10], #16 + ld1 { v0.4S}, [ x9], #16 + ld1 { v4.4S}, [x10], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [ x9], #16 - ld1 { v5.4S}, [x10], #16 + ld1 { v1.4S}, [ x9], #16 + ld1 { v5.4S}, [x10], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [ x9], #16 - ld1 { v6.4S}, [x10], #16 + ld1 { v2.4S}, [ x9], #16 + ld1 { v6.4S}, [x10], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [ x9], #16 + ld1 { v3.4S}, [ x9], #16 ld1 { v7.4S}, [x10], #16 #if L > 4 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x11], #16 - ld1 { v4.4S}, [x12], #16 + ld1 { v0.4S}, [x11], #16 + ld1 { v4.4S}, [x12], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x11], #16 - ld1 { v5.4S}, [x12], #16 + ld1 { v1.4S}, [x11], #16 + ld1 { v5.4S}, [x12], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x11], #16 - ld1 { v6.4S}, [x12], #16 + ld1 { v2.4S}, [x11], #16 + ld1 { v6.4S}, [x12], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x11], #16 + ld1 { v3.4S}, [x11], #16 ld1 { v7.4S}, [x12], #16 #endif #if L > 5 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x13], #16 - ld1 { v4.4S}, [x14], #16 + ld1 { v0.4S}, [x13], #16 + ld1 { v4.4S}, [x14], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x13], #16 - ld1 { v5.4S}, [x14], #16 + ld1 { v1.4S}, [x13], #16 + ld1 { v5.4S}, [x14], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x13], #16 - ld1 { v6.4S}, [x14], #16 + ld1 { v2.4S}, [x13], #16 + ld1 { v6.4S}, [x14], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x13], #16 + ld1 { v3.4S}, [x13], #16 ld1 { v7.4S}, [x14], #16 smlal v12.2D, v0.2S, v4.2S smlal2 v16.2D, v0.4S, v4.4S - ld1 { v0.4S}, [x15], #16 - ld1 { v4.4S}, [x19], #16 + ld1 { v0.4S}, [x15], #16 + ld1 { v4.4S}, [x19], #16 smlal v13.2D, v1.2S, v5.2S smlal2 v17.2D, v1.4S, v5.4S - ld1 { v1.4S}, [x15], #16 - ld1 { v5.4S}, [x19], #16 + ld1 { v1.4S}, [x15], #16 + ld1 { v5.4S}, [x19], #16 smlal v14.2D, v2.2S, v6.2S smlal2 v18.2D, v2.4S, v6.4S - ld1 { v2.4S}, [x15], #16 - ld1 { v6.4S}, [x19], #16 + ld1 { v2.4S}, [x15], #16 + ld1 { v6.4S}, [x19], #16 smlal v15.2D, v3.2S, v7.2S smlal2 v19.2D, v3.4S, v7.4S - ld1 { v3.4S}, [x15], #16 + ld1 { v3.4S}, [x15], #16 ld1 { v7.4S}, [x19], #16 #endif @@ -1173,14 +1176,14 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: mul v26.4S, v22.4S, v31.4S mul v27.4S, v23.4S, v31.4S - smlsl v12.2D, v24.2S, v30.2S - smlsl2 v16.2D, v24.4S, v30.4S - smlsl v13.2D, v25.2S, v30.2S - smlsl2 v17.2D, v25.4S, v30.4S - smlsl v14.2D, v26.2S, v30.2S - smlsl2 v18.2D, v26.4S, v30.4S - smlsl v15.2D, v27.2S, v30.2S - smlsl2 v19.2D, v27.4S, v30.4S + smlal v12.2D, v24.2S, v30.2S + smlal2 v16.2D, v24.4S, v30.4S + smlal v13.2D, v25.2S, v30.2S + smlal2 v17.2D, v25.4S, v30.4S + smlal v14.2D, v26.2S, v30.2S + smlal2 v18.2D, v26.4S, v30.4S + smlal v15.2D, v27.2S, v30.2S + smlal2 v19.2D, v27.4S, v30.4S uzp2 v24.4S, v12.4S, v16.4S uzp2 v25.4S, v13.4S, v17.4S @@ -1194,7 +1197,7 @@ _PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery: pop_all - br lr + ret diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/api.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/api.h index 5668ee3c6..c211dc659 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/api.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/api.h @@ -12,8 +12,8 @@ #define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES 2592 #define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES 4896 -#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES 4627 -#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_ALGNAME "Dilithium5" +#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES 4627 +#define PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_ALGNAME "Dilithium5" int PQCLEAN_DILITHIUM5_AARCH64_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/feat.S b/Modules/PQClean/crypto_sign/dilithium5/aarch64/feat.S deleted file mode 100644 index 01abc10a6..000000000 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/feat.S +++ /dev/null @@ -1,168 +0,0 @@ - -/* -MIT License - -Copyright (c) 2020 Bas Westerbaan -Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) - -.macro round - ; Execute theta, but without xoring into the state yet. - ; Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. - eor3.16b v25, v0, v5, v10 - eor3.16b v26, v1, v6, v11 - eor3.16b v27, v2, v7, v12 - eor3.16b v28, v3, v8, v13 - eor3.16b v29, v4, v9, v14 - - eor3.16b v25, v25, v15, v20 - eor3.16b v26, v26, v16, v21 - eor3.16b v27, v27, v17, v22 - eor3.16b v28, v28, v18, v23 - eor3.16b v29, v29, v19, v24 - - rax1.2d v30, v29, v26 ; d[0] = rotl(p[1], 1) ^ p[4] - rax1.2d v29, v27, v29 ; d[3] = rotl(p[4], 1) ^ p[2] - rax1.2d v27, v25, v27 ; d[1] = rotl(p[2], 1) ^ p[0] - rax1.2d v25, v28, v25 ; d[4] = rotl(p[0], 1) ^ p[3] - rax1.2d v28, v26, v28 ; d[2] = rotl(p[3], 1) ^ p[1] - - ; Xor parities from step theta into the state at the same time - ; as executing rho and pi. - eor.16b v0, v0, v30 - mov.16b v31, v1 - xar.2d v1, v6, v27, 20 - xar.2d v6, v9, v25, 44 - xar.2d v9, v22, v28, 3 - xar.2d v22, v14, v25, 25 - xar.2d v14, v20, v30, 46 - xar.2d v20, v2, v28, 2 - xar.2d v2, v12, v28, 21 - xar.2d v12, v13, v29, 39 - xar.2d v13, v19, v25, 56 - xar.2d v19, v23, v29, 8 - xar.2d v23, v15, v30, 23 - xar.2d v15, v4, v25, 37 - xar.2d v4, v24, v25, 50 - xar.2d v24, v21, v27, 62 - xar.2d v21, v8, v29, 9 - xar.2d v8, v16, v27, 19 - xar.2d v16, v5, v30, 28 - xar.2d v5, v3, v29, 36 - xar.2d v3, v18, v29, 43 - xar.2d v18, v17, v28, 49 - xar.2d v17, v11, v27, 54 - xar.2d v11, v7, v28, 58 - xar.2d v7, v10, v30, 61 - xar.2d v10, v31, v27, 63 - - ; Chi - bcax.16b v25, v0, v2, v1 - bcax.16b v26, v1, v3, v2 - bcax.16b v2, v2, v4, v3 - bcax.16b v3, v3, v0, v4 - bcax.16b v4, v4, v1, v0 - mov.16b v0, v25 - mov.16b v1, v26 - - bcax.16b v25, v5, v7, v6 - bcax.16b v26, v6, v8, v7 - bcax.16b v7, v7, v9, v8 - bcax.16b v8, v8, v5, v9 - bcax.16b v9, v9, v6, v5 - mov.16b v5, v25 - mov.16b v6, v26 - - bcax.16b v25, v10, v12, v11 - bcax.16b v26, v11, v13, v12 - bcax.16b v12, v12, v14, v13 - bcax.16b v13, v13, v10, v14 - bcax.16b v14, v14, v11, v10 - mov.16b v10, v25 - mov.16b v11, v26 - - bcax.16b v25, v15, v17, v16 - bcax.16b v26, v16, v18, v17 - bcax.16b v17, v17, v19, v18 - bcax.16b v18, v18, v15, v19 - bcax.16b v19, v19, v16, v15 - mov.16b v15, v25 - mov.16b v16, v26 - - bcax.16b v25, v20, v22, v21 - bcax.16b v26, v21, v23, v22 - bcax.16b v22, v22, v24, v23 - bcax.16b v23, v23, v20, v24 - bcax.16b v24, v24, v21, v20 - mov.16b v20, v25 - mov.16b v21, v26 - - ; iota - ld1r {v25.2d}, [x1], #8 - eor.16b v0, v0, v25 -.endm - -.align 4 -.global PQCLEAN_DILITHIUM5_AARCH64_f1600x2 -.global _PQCLEAN_DILITHIUM5_AARCH64_f1600x2 -PQCLEAN_DILITHIUM5_AARCH64_f1600x2: -_PQCLEAN_DILITHIUM5_AARCH64_f1600x2: - stp d8, d9, [sp,#-16]! - stp d10, d11, [sp,#-16]! - stp d12, d13, [sp,#-16]! - stp d14, d15, [sp,#-16]! - - mov x2, x0 - mov x3, #24 - - ld1.2d {v0, v1, v2, v3}, [x0], #64 - ld1.2d {v4, v5, v6, v7}, [x0], #64 - ld1.2d {v8, v9, v10, v11}, [x0], #64 - ld1.2d {v12, v13, v14, v15}, [x0], #64 - ld1.2d {v16, v17, v18, v19}, [x0], #64 - ld1.2d {v20, v21, v22, v23}, [x0], #64 - ld1.2d {v24}, [x0] - -loop: - round - - subs x3, x3, #1 - cbnz x3, loop - - mov x0, x2 - st1.2d {v0, v1, v2, v3}, [x0], #64 - st1.2d {v4, v5, v6, v7}, [x0], #64 - st1.2d {v8, v9, v10, v11}, [x0], #64 - st1.2d {v12, v13, v14, v15}, [x0], #64 - st1.2d {v16, v17, v18, v19}, [x0], #64 - st1.2d {v20, v21, v22, v23}, [x0], #64 - st1.2d {v24}, [x0] - - ldp d14, d15, [sp], #16 - ldp d12, d13, [sp], #16 - ldp d10, d11, [sp], #16 - ldp d8, d9, [sp], #16 - - ret lr - -#endif diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/fips202x2.c b/Modules/PQClean/crypto_sign/dilithium5/aarch64/fips202x2.c deleted file mode 100644 index 63761d23e..000000000 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/fips202x2.c +++ /dev/null @@ -1,684 +0,0 @@ - -/* - * This file was originally licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - * - * We choose - * CC0 1.0 Universal or the following MIT License for this file. - * - * MIT License - * - * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "fips202x2.h" - -#define NROUNDS 24 - -// Define NEON operation -// c = load(ptr) -#define vload(ptr) vld1q_u64(ptr); -// ptr <= c; -#define vstore(ptr, c) vst1q_u64(ptr, c); -// c = a ^ b -#define vxor(c, a, b) c = veorq_u64(a, b); -// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) -#define vROL(out, a, offset) \ - (out) = vshlq_n_u64(a, offset); \ - (out) = vsriq_n_u64(out, a, 64 - (offset)); -// Xor chain: out = a ^ b ^ c ^ d ^ e -#define vXOR4(out, a, b, c, d, e) \ - (out) = veorq_u64(a, b); \ - (out) = veorq_u64(out, c); \ - (out) = veorq_u64(out, d); \ - (out) = veorq_u64(out, e); -// Not And c = ~a & b -// #define vbic(c, a, b) c = vbicq_u64(b, a); -// Xor Not And: out = a ^ ( (~b) & c) -#define vXNA(out, a, b, c) \ - (out) = vbicq_u64(c, b); \ - (out) = veorq_u64(out, a); -// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support -#define vrxor(c, a, b) c = vrax1q_u64(a, b); -// End Define - -/* Keccak round constants */ -static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { - (uint64_t)0x0000000000000001ULL, - (uint64_t)0x0000000000008082ULL, - (uint64_t)0x800000000000808aULL, - (uint64_t)0x8000000080008000ULL, - (uint64_t)0x000000000000808bULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008009ULL, - (uint64_t)0x000000000000008aULL, - (uint64_t)0x0000000000000088ULL, - (uint64_t)0x0000000080008009ULL, - (uint64_t)0x000000008000000aULL, - (uint64_t)0x000000008000808bULL, - (uint64_t)0x800000000000008bULL, - (uint64_t)0x8000000000008089ULL, - (uint64_t)0x8000000000008003ULL, - (uint64_t)0x8000000000008002ULL, - (uint64_t)0x8000000000000080ULL, - (uint64_t)0x000000000000800aULL, - (uint64_t)0x800000008000000aULL, - (uint64_t)0x8000000080008081ULL, - (uint64_t)0x8000000000008080ULL, - (uint64_t)0x0000000080000001ULL, - (uint64_t)0x8000000080008008ULL -}; - -/************************************************* -* Name: KeccakF1600_StatePermutex2 -* -* Description: The Keccak F1600 Permutation -* -* Arguments: - uint64_t *state: pointer to input/output Keccak state -**************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_f1600x2(v128 *, const uint64_t *); -static inline -void KeccakF1600_StatePermutex2(v128 state[25]) { - #if (__APPLE__ && __ARM_FEATURE_CRYPTO) || (__ARM_FEATURE_SHA3) /* although not sure what is being implemented, we find something fast */ - PQCLEAN_DILITHIUM5_AARCH64_f1600x2(state, neon_KeccakF_RoundConstants); - #else - v128 Aba, Abe, Abi, Abo, Abu; - v128 Aga, Age, Agi, Ago, Agu; - v128 Aka, Ake, Aki, Ako, Aku; - v128 Ama, Ame, Ami, Amo, Amu; - v128 Asa, Ase, Asi, Aso, Asu; - v128 BCa, BCe, BCi, BCo, BCu; // tmp - v128 Da, De, Di, Do, Du; // D - v128 Eba, Ebe, Ebi, Ebo, Ebu; - v128 Ega, Ege, Egi, Ego, Egu; - v128 Eka, Eke, Eki, Eko, Eku; - v128 Ema, Eme, Emi, Emo, Emu; - v128 Esa, Ese, Esi, Eso, Esu; - - //copyFromState(A, state) - Aba = state[0]; - Abe = state[1]; - Abi = state[2]; - Abo = state[3]; - Abu = state[4]; - Aga = state[5]; - Age = state[6]; - Agi = state[7]; - Ago = state[8]; - Agu = state[9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for (int round = 0; round < NROUNDS; round += 2) { - // prepareTheta - vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); - vXOR4(BCe, Abe, Age, Ake, Ame, Ase); - vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); - vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); - vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Aba, Aba, Da); - vxor(Age, Age, De); - vROL(BCe, Age, 44); - vxor(Aki, Aki, Di); - vROL(BCi, Aki, 43); - vxor(Amo, Amo, Do); - vROL(BCo, Amo, 21); - vxor(Asu, Asu, Du); - vROL(BCu, Asu, 14); - vXNA(Eba, Aba, BCe, BCi); - vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); - vXNA(Ebe, BCe, BCi, BCo); - vXNA(Ebi, BCi, BCo, BCu); - vXNA(Ebo, BCo, BCu, Aba); - vXNA(Ebu, BCu, Aba, BCe); - - vxor(Abo, Abo, Do); - vROL(BCa, Abo, 28); - vxor(Agu, Agu, Du); - vROL(BCe, Agu, 20); - vxor(Aka, Aka, Da); - vROL(BCi, Aka, 3); - vxor(Ame, Ame, De); - vROL(BCo, Ame, 45); - vxor(Asi, Asi, Di); - vROL(BCu, Asi, 61); - vXNA(Ega, BCa, BCe, BCi); - vXNA(Ege, BCe, BCi, BCo); - vXNA(Egi, BCi, BCo, BCu); - vXNA(Ego, BCo, BCu, BCa); - vXNA(Egu, BCu, BCa, BCe); - - vxor(Abe, Abe, De); - vROL(BCa, Abe, 1); - vxor(Agi, Agi, Di); - vROL(BCe, Agi, 6); - vxor(Ako, Ako, Do); - vROL(BCi, Ako, 25); - vxor(Amu, Amu, Du); - vROL(BCo, Amu, 8); - vxor(Asa, Asa, Da); - vROL(BCu, Asa, 18); - vXNA(Eka, BCa, BCe, BCi); - vXNA(Eke, BCe, BCi, BCo); - vXNA(Eki, BCi, BCo, BCu); - vXNA(Eko, BCo, BCu, BCa); - vXNA(Eku, BCu, BCa, BCe); - - vxor(Abu, Abu, Du); - vROL(BCa, Abu, 27); - vxor(Aga, Aga, Da); - vROL(BCe, Aga, 36); - vxor(Ake, Ake, De); - vROL(BCi, Ake, 10); - vxor(Ami, Ami, Di); - vROL(BCo, Ami, 15); - vxor(Aso, Aso, Do); - vROL(BCu, Aso, 56); - vXNA(Ema, BCa, BCe, BCi); - vXNA(Eme, BCe, BCi, BCo); - vXNA(Emi, BCi, BCo, BCu); - vXNA(Emo, BCo, BCu, BCa); - vXNA(Emu, BCu, BCa, BCe); - - vxor(Abi, Abi, Di); - vROL(BCa, Abi, 62); - vxor(Ago, Ago, Do); - vROL(BCe, Ago, 55); - vxor(Aku, Aku, Du); - vROL(BCi, Aku, 39); - vxor(Ama, Ama, Da); - vROL(BCo, Ama, 41); - vxor(Ase, Ase, De); - vROL(BCu, Ase, 2); - vXNA(Esa, BCa, BCe, BCi); - vXNA(Ese, BCe, BCi, BCo); - vXNA(Esi, BCi, BCo, BCu); - vXNA(Eso, BCo, BCu, BCa); - vXNA(Esu, BCu, BCa, BCe); - - // Next Round - - // prepareTheta - vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); - vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); - vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); - vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); - vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - vROL(Da, BCe, 1); - vxor(Da, BCu, Da); - vROL(De, BCi, 1); - vxor(De, BCa, De); - vROL(Di, BCo, 1); - vxor(Di, BCe, Di); - vROL(Do, BCu, 1); - vxor(Do, BCi, Do); - vROL(Du, BCa, 1); - vxor(Du, BCo, Du); - - vxor(Eba, Eba, Da); - vxor(Ege, Ege, De); - vROL(BCe, Ege, 44); - vxor(Eki, Eki, Di); - vROL(BCi, Eki, 43); - vxor(Emo, Emo, Do); - vROL(BCo, Emo, 21); - vxor(Esu, Esu, Du); - vROL(BCu, Esu, 14); - vXNA(Aba, Eba, BCe, BCi); - vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); - vXNA(Abe, BCe, BCi, BCo); - vXNA(Abi, BCi, BCo, BCu); - vXNA(Abo, BCo, BCu, Eba); - vXNA(Abu, BCu, Eba, BCe); - - vxor(Ebo, Ebo, Do); - vROL(BCa, Ebo, 28); - vxor(Egu, Egu, Du); - vROL(BCe, Egu, 20); - vxor(Eka, Eka, Da); - vROL(BCi, Eka, 3); - vxor(Eme, Eme, De); - vROL(BCo, Eme, 45); - vxor(Esi, Esi, Di); - vROL(BCu, Esi, 61); - vXNA(Aga, BCa, BCe, BCi); - vXNA(Age, BCe, BCi, BCo); - vXNA(Agi, BCi, BCo, BCu); - vXNA(Ago, BCo, BCu, BCa); - vXNA(Agu, BCu, BCa, BCe); - - vxor(Ebe, Ebe, De); - vROL(BCa, Ebe, 1); - vxor(Egi, Egi, Di); - vROL(BCe, Egi, 6); - vxor(Eko, Eko, Do); - vROL(BCi, Eko, 25); - vxor(Emu, Emu, Du); - vROL(BCo, Emu, 8); - vxor(Esa, Esa, Da); - vROL(BCu, Esa, 18); - vXNA(Aka, BCa, BCe, BCi); - vXNA(Ake, BCe, BCi, BCo); - vXNA(Aki, BCi, BCo, BCu); - vXNA(Ako, BCo, BCu, BCa); - vXNA(Aku, BCu, BCa, BCe); - - vxor(Ebu, Ebu, Du); - vROL(BCa, Ebu, 27); - vxor(Ega, Ega, Da); - vROL(BCe, Ega, 36); - vxor(Eke, Eke, De); - vROL(BCi, Eke, 10); - vxor(Emi, Emi, Di); - vROL(BCo, Emi, 15); - vxor(Eso, Eso, Do); - vROL(BCu, Eso, 56); - vXNA(Ama, BCa, BCe, BCi); - vXNA(Ame, BCe, BCi, BCo); - vXNA(Ami, BCi, BCo, BCu); - vXNA(Amo, BCo, BCu, BCa); - vXNA(Amu, BCu, BCa, BCe); - - vxor(Ebi, Ebi, Di); - vROL(BCa, Ebi, 62); - vxor(Ego, Ego, Do); - vROL(BCe, Ego, 55); - vxor(Eku, Eku, Du); - vROL(BCi, Eku, 39); - vxor(Ema, Ema, Da); - vROL(BCo, Ema, 41); - vxor(Ese, Ese, De); - vROL(BCu, Ese, 2); - vXNA(Asa, BCa, BCe, BCi); - vXNA(Ase, BCe, BCi, BCo); - vXNA(Asi, BCi, BCo, BCu); - vXNA(Aso, BCo, BCu, BCa); - vXNA(Asu, BCu, BCa, BCe); - } - - state[0] = Aba; - state[1] = Abe; - state[2] = Abi; - state[3] = Abo; - state[4] = Abu; - state[5] = Aga; - state[6] = Age; - state[7] = Agi; - state[8] = Ago; - state[9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - #endif -} - -/************************************************* -* Name: keccakx2_absorb -* -* Description: Absorb step of Keccak; -* non-incremental, starts by zeroeing the state. -* -* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - const uint8_t *m: pointer to input to be absorbed into s -* - size_t mlen: length of input in bytes -* - uint8_t p: domain-separation byte for different -* Keccak-derived functions -**************************************************/ -static -void keccakx2_absorb(v128 s[25], - unsigned int r, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen, - uint8_t p) { - size_t i, pos = 0; - - // Declare SIMD registers - v128 tmp, mask; - uint64x1_t a, b; - uint64x2_t a1, b1, atmp1, btmp1; - uint64x2x2_t a2, b2, atmp2, btmp2; - // End - - for (i = 0; i < 25; ++i) { - s[i] = vdupq_n_u64(0); - } - - // Load in0[i] to register, then in1[i] to register, exchange them - while (inlen >= r) { - for (i = 0; i < r / 8 - 1; i += 4) { - a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); - b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); - atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); - // AC = zip2(AB and CD) - btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); - btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); - - vxor(s[i + 0], s[i + 0], atmp2.val[0]); - vxor(s[i + 1], s[i + 1], btmp2.val[0]); - vxor(s[i + 2], s[i + 2], atmp2.val[1]); - vxor(s[i + 3], s[i + 3], btmp2.val[1]); - - pos += 8 * 2 * 2; - } - // Last iteration - i = r / 8 - 1; - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - pos += 8; - - KeccakF1600_StatePermutex2(s); - inlen -= r; - } - - i = 0; - while (inlen >= 16) { - a1 = vld1q_u64((uint64_t *)&in0[pos]); - b1 = vld1q_u64((uint64_t *)&in1[pos]); - // BD = zip1(AB and CD) - atmp1 = vzip1q_u64(a1, b1); - // AC = zip2(AB and CD) - btmp1 = vzip2q_u64(a1, b1); - - vxor(s[i + 0], s[i + 0], atmp1); - vxor(s[i + 1], s[i + 1], btmp1); - - i += 2; - pos += 8 * 2; - inlen -= 8 * 2; - } - - if (inlen >= 8) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - vxor(s[i], s[i], tmp); - - i++; - pos += 8; - inlen -= 8; - } - - if (inlen) { - a = vld1_u64((uint64_t *)&in0[pos]); - b = vld1_u64((uint64_t *)&in1[pos]); - tmp = vcombine_u64(a, b); - mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); - tmp = vandq_u64(tmp, mask); - vxor(s[i], s[i], tmp); - } - - tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); - vxor(s[i], s[i], tmp); - - mask = vdupq_n_u64(1ULL << 63); - vxor(s[r / 8 - 1], s[r / 8 - 1], mask); -} - -/************************************************* -* Name: keccak_squeezeblocks -* -* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -* Modifies the state. Can be called multiple times to keep -* squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed (written to h) -* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) -* - uint64_t *s: pointer to input/output Keccak state -**************************************************/ -static -void keccakx2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - unsigned int r, - v128 s[25]) { - unsigned int i; - - uint64x1_t a, b; - uint64x2x2_t a2, b2; - - while (nblocks > 0) { - KeccakF1600_StatePermutex2(s); - - for (i = 0; i < r / 8 - 1; i += 4) { - a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); - b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); - a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); - b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); - vst1q_u64_x2((uint64_t *)out0, a2); - vst1q_u64_x2((uint64_t *)out1, b2); - - out0 += 32; - out1 += 32; - } - - i = r / 8 - 1; - // Last iteration - a = vget_low_u64(s[i]); - b = vget_high_u64(s[i]); - vst1_u64((uint64_t *)out0, a); - vst1_u64((uint64_t *)out1, b); - - out0 += 8; - out1 += 8; - - --nblocks; - } -} - -/************************************************* -* Name: shake128x2_absorb -* -* Description: Absorb step of the SHAKE128 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *state: pointer to (uninitialized) output -* Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -* SHAKE128_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); -} - -/************************************************* -* Name: shake256_absorb -* -* Description: Absorb step of the SHAKE256 XOF. -* non-incremental, starts by zeroeing the state. -* -* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *in: pointer to input to be absorbed into s -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); -} - -/************************************************* -* Name: shake256_squeezeblocks -* -* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of -* SHAKE256_RATE bytes each. Modifies the state. Can be called -* multiple times to keep squeezing, i.e., is incremental. -* -* Arguments: - uint8_t *out: pointer to output blocks -* - size_t nblocks: number of blocks to be squeezed -* (written to output) -* - keccakx2_state *s: pointer to input/output Keccak state -**************************************************/ -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state) { - keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); -} - -/************************************************* -* Name: shake128 -* -* Description: SHAKE128 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE128_RATE; - uint8_t t[2][SHAKE128_RATE]; - keccakx2_state state; - - shake128x2_absorb(&state, in0, in1, inlen); - shake128x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE128_RATE; - out1 += nblocks * SHAKE128_RATE; - outlen -= nblocks * SHAKE128_RATE; - - if (outlen) { - shake128x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} - -/************************************************* -* Name: shake256 -* -* Description: SHAKE256 XOF with non-incremental API -* -* Arguments: - uint8_t *out: pointer to output -* - size_t outlen: requested output length in bytes -* - const uint8_t *in: pointer to input -* - size_t inlen: length of input in bytes -**************************************************/ -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen) { - unsigned int i; - size_t nblocks = outlen / SHAKE256_RATE; - uint8_t t[2][SHAKE256_RATE]; - keccakx2_state state; - - shake256x2_absorb(&state, in0, in1, inlen); - shake256x2_squeezeblocks(out0, out1, nblocks, &state); - - out0 += nblocks * SHAKE256_RATE; - out1 += nblocks * SHAKE256_RATE; - outlen -= nblocks * SHAKE256_RATE; - - if (outlen) { - shake256x2_squeezeblocks(t[0], t[1], 1, &state); - for (i = 0; i < outlen; ++i) { - out0[i] = t[0][i]; - out1[i] = t[1][i]; - } - } -} diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/fips202x2.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/fips202x2.h deleted file mode 100644 index 28babbc39..000000000 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/fips202x2.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef FIPS202X2_H -#define FIPS202X2_H - -/* - * This file is licensed - * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) - * at https://github.com/GMUCERG/PQC_NEON/blob/main/neon/kyber or - * public domain at https://github.com/cothan/kyber/blob/master/neon - */ - -#include "params.h" -#include -#include - -typedef uint64x2_t v128; - -#define SHAKE128_RATE 168 -#define SHAKE256_RATE 136 -#define SHA3_256_RATE 136 -#define SHA3_512_RATE 72 - -typedef struct { - v128 s[25]; -} keccakx2_state; - -#define shake128x2_absorb DILITHIUM_NAMESPACE(shake128x2_absorb) -void shake128x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake128x2_squeezeblocks DILITHIUM_NAMESPACE(shake128x2_squeezeblocks) -void shake128x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake256x2_absorb DILITHIUM_NAMESPACE(shake256x2_absorb) -void shake256x2_absorb(keccakx2_state *state, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2_squeezeblocks DILITHIUM_NAMESPACE(shake256x2_squeezeblocks) -void shake256x2_squeezeblocks(uint8_t *out0, - uint8_t *out1, - size_t nblocks, - keccakx2_state *state); - -#define shake128x2 DILITHIUM_NAMESPACE(shake128x2) -void shake128x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); - -#define shake256x2 DILITHIUM_NAMESPACE(shake256x2) -void shake256x2(uint8_t *out0, - uint8_t *out1, - size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - size_t inlen); -#endif diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/macros.inc b/Modules/PQClean/crypto_sign/dilithium5/aarch64/macros.inc index ef3af4c54..5504405c1 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/macros.inc +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/macros.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,24 +30,254 @@ #include "macros_common.inc" -.macro wrap_trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3, qS, dD +.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - trn1 \t0\qS, \a0\qS, \a1\qS - trn2 \t1\qS, \a0\qS, \a1\qS - trn1 \t2\qS, \a2\qS, \a3\qS - trn2 \t3\qS, \a2\qS, \a3\qS + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S - trn1 \a0\dD, \t0\dD, \t2\dD - trn2 \a2\dD, \t0\dD, \t2\dD - trn1 \a1\dD, \t1\dD, \t3\dD - trn2 \a3\dD, \t1\dD, \t3\dD + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D .endm -.macro trn_4x4 a0, a1, a2, a3, t0, t1, t2, t3 - wrap_trn_4x4 \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, .4S, .2D + +.macro trn_4x4_l3 a0, a1, a2, a3, t0, t1, t2, t3, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\srcc_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\srcc_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\srcc_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_s4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_l4 a0, a1, a2, a3, t0, t1, t2, t3, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2l4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + ldr \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + ldr \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + ldr \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +.macro trn_4x4_2s4 a0, a1, a2, a3, t0, t1, t2, t3, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\src0_ptr, \memc0] + trn1 \t0\().4S, \a0\().4S, \a1\().4S + trn2 \t1\().4S, \a0\().4S, \a1\().4S + str \c1, [\src1_ptr, \memc1] + trn1 \t2\().4S, \a2\().4S, \a3\().4S + trn2 \t3\().4S, \a2\().4S, \a3\().4S + + str \c2, [\src0_ptr, \memc2] + trn1 \a0\().2D, \t0\().2D, \t2\().2D + trn2 \a2\().2D, \t0\().2D, \t2\().2D + str \c3, [\src1_ptr, \memc3] + trn1 \a1\().2D, \t1\().2D, \t3\().2D + trn2 \a3\().2D, \t1\().2D, \t3\().2D + +.endm + +// ==== 16-bit start ==== + +.macro qo_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oX_barrett \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q + wrap_qo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q + wrap_oo_barrett_vec \a0, \a1, \a2, \a3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \barrett_const, \shrv, \Q, .8H, .H +.endm + + +.macro qo_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_tops \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + +.macro qo_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_topsl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + + + +.macro qo_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botsl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_botsl_mul \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7 +.endm + + + +.macro qo_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + +.macro qo_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_butterfly_mixl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 .endm +.macro qo_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qo_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + +.macro qo_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .8H, .H +.endm + + +.macro do_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_2ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H, \src0_ptr, \src1_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro do_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 + wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .8H, .H +.endm + +.macro do_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \a4, \a5, \a6, \a7, \t4, \t5, \t6, \t7, \a8, \a9, \a10, \a11, \t8, \t9, \t10, \t11, \barrett_const, \shrv, \Q, .8H, .H +.endm + +.macro do_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mixl \a0, \a1, \b0, \b1, \t0, \t1, \b2, \b3, \t2, \t3, \mod, \l2, \h2, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .8H, .H +.endm + +.macro do_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l4 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro do_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + wrap_dX_butterfly_vec_mix_rev_l3 \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, .8H, .H, \srcc_ptr, \c1, \c2, \c3, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_montgomery_mul_in \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H +.endm + +.macro qo_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_inl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_qX_montgomery_mul_ins \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + +.macro qo_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_montgomery_mul_insl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .8H, .H, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +// ==== 16-bit end ==== + +// ==== 32-bit start ==== .macro dq_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1 wrap_dX_butterfly_vec_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S @@ -54,12 +287,20 @@ wrap_dX_butterfly_vec_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_trn_4x4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 .endm -.macro dq_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 - wrap_dX_butterfly_vec_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.macro dq_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + wrap_dX_butterfly_vec_top_ltrn_4x4 \b0, \b1, \t0, \t1, \mod, \l0, \h0, \l1, \h1, .4S, .S, \src0_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \trns0, \trns1, \trns2, \trns3, \trnt0, \trnt1, \trnt2, \trnt3 +.endm + +.macro dq_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_dX_butterfly_vec_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .4S, .S .endm @@ -67,16 +308,32 @@ wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm +.macro dq_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + wrap_dX_butterfly_topl4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3 +.endm + + +.macro dq_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_dX_butterfly_top2l4s4 \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S, \srcc_ptr, \c0, \c1, \memc0, \memc1, \srcd_ptr, \d0, \d1, \memd0, \memd1, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + + .macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S .endm -.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro dq_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + wrap_dX_butterfly_mixl6 \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \c4, \c5, \memc0, \memc1, \memc2, \memc3, \memc4, \memc5 .endm -.macro dq_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 - wrap_dX_butterfly_mixed_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.macro dq_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mix_rev \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm @@ -89,16 +346,48 @@ wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm +.macro qq_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + wrap_qX_butterfly_topl \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S, \src_ptr, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 +.endm + + + .macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S .endm -.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_botss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_botsls \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 +.endm + + + +.macro qq_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro qq_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + wrap_qX_butterfly_mixll \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixss \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3 +.endm + +.macro qq_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + wrap_qX_butterfly_mixssl \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S, \srcc_ptr, \c0, \c1, \c2, \c3, \memc0, \memc1, \memc2, \memc3, \srcd_ptr, \d0, \d1, \d2, \d3, \memd0, \memd1, \memd2, \memd3, \srce_ptr, \e0, \e1, \e2, \e3, \meme0, \meme1, \meme2, \meme3 .endm -.macro qq_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 - wrap_qX_butterfly_mixed_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.macro qq_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mix_rev \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S .endm @@ -109,3 +398,5 @@ .macro qq_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 wrap_qX_sub_add \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S .endm + +// === 32-bit end ==== diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/macros_common.inc b/Modules/PQClean/crypto_sign/dilithium5/aarch64/macros_common.inc index bd7e77eb9..07568491d 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/macros_common.inc +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/macros_common.inc @@ -1,10 +1,13 @@ /* - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,35 +28,58 @@ * SOFTWARE. */ +#ifndef MACROS_COMMON +#define MACROS_COMMON + // for ABI .macro push_all - sub sp, sp, #(16*9) - stp x19, x20, [sp, #16*0] - stp x21, x22, [sp, #16*1] - stp x23, x24, [sp, #16*2] - stp x25, x26, [sp, #16*3] - stp x27, x28, [sp, #16*4] - stp d8, d9, [sp, #16*5] - stp d10, d11, [sp, #16*6] - stp d12, d13, [sp, #16*7] - stp d14, d15, [sp, #16*8] + sub sp, sp, #(9*16) + stp x19, x20, [sp, #0*16] + stp x21, x22, [sp, #1*16] + stp x23, x24, [sp, #2*16] + stp x25, x26, [sp, #3*16] + stp x27, x28, [sp, #4*16] + stp d8, d9, [sp, #5*16] + stp d10, d11, [sp, #6*16] + stp d12, d13, [sp, #7*16] + stp d14, d15, [sp, #8*16] .endm .macro pop_all - ldp x19, x20, [sp, #16*0] - ldp x21, x22, [sp, #16*1] - ldp x23, x24, [sp, #16*2] - ldp x25, x26, [sp, #16*3] - ldp x27, x28, [sp, #16*4] - ldp d8, d9, [sp, #16*5] - ldp d10, d11, [sp, #16*6] - ldp d12, d13, [sp, #16*7] - ldp d14, d15, [sp, #16*8] - add sp, sp, #(16*9) + ldp x19, x20, [sp, #0*16] + ldp x21, x22, [sp, #1*16] + ldp x23, x24, [sp, #2*16] + ldp x25, x26, [sp, #3*16] + ldp x27, x28, [sp, #4*16] + ldp d8, d9, [sp, #5*16] + ldp d10, d11, [sp, #6*16] + ldp d12, d13, [sp, #7*16] + ldp d14, d15, [sp, #8*16] + add sp, sp, #(9*16) + +.endm + +.macro push_simd + + sub sp, sp, #(4*16) + stp d8, d9, [sp, #0*16] + stp d10, d11, [sp, #1*16] + stp d12, d13, [sp, #2*16] + stp d14, d15, [sp, #3*16] + +.endm + +.macro pop_simd + + ldp d8, d9, [sp, #0*16] + ldp d10, d11, [sp, #1*16] + ldp d12, d13, [sp, #2*16] + ldp d14, d15, [sp, #3*16] + add sp, sp, #(4*16) .endm @@ -72,6 +98,45 @@ .endm +.macro wrap_dX_butterfly_topl4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, src_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\src_ptr, \memc0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\src_ptr, \memc1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \c2, [\src_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c3, [\src_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_top2l4s4 a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX, srcc_ptr, c0, c1, memc0, memc1, srcd_ptr, d0, d1, memd0, memd1, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + ldr \c0, [\srcc_ptr, \memc0] + str \e0, [\srce_ptr, \meme0] + mul \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + str \e1, [\srce_ptr, \meme1] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + ldr \d0, [\srcd_ptr, \memd0] + str \e2, [\srce_ptr, \meme2] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \d1, [\srcd_ptr, \memd1] + str \e3, [\srce_ptr, \meme3] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + + .macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -82,7 +147,7 @@ .endm -.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \z2\nX[\h2] @@ -99,7 +164,30 @@ .endm -.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX +.macro wrap_dX_butterfly_mixl6 a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, c4, c5, memc0, memc1, memc2, memc3, memc4, memc5 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c2, [\srcc_ptr, \memc2] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\srcc_ptr, \memc3] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + ldr \c4, [\srcc_ptr, \memc4] + mls \t2\wX, \b2\wX, \mod\nX[0] + ldr \c5, [\srcc_ptr, \memc5] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b2\wX, \a2\wX, \t2\wX @@ -135,6 +223,79 @@ .endm +.macro wrap_qX_butterfly_topl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + ldr \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + ldr \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + ldr \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + ldr \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_tops b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, src_ptr, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\src_ptr, \mem0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\src_ptr, \mem1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\src_ptr, \mem2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\src_ptr, \mem3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_topsl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + .macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -149,7 +310,134 @@ .endm -.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_botll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + ldr \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + ldr \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + ldr \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + ldr \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + +.endm + +.macro wrap_qX_butterfly_botsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + + add \a0\wX, \a0\wX, \t0\wX + str \e0, [\srce_ptr, \meme0] + add \a1\wX, \a1\wX, \t1\wX + str \e1, [\srce_ptr, \meme1] + add \a2\wX, \a2\wX, \t2\wX + str \e2, [\srce_ptr, \meme2] + add \a3\wX, \a3\wX, \t3\wX + str \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_botsl_mul a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, a4, a5, a6, a7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + + sqrdmulh \t4\wX, \a4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t5\wX, \a5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t6\wX, \a6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t7\wX, \a7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + mul \a4\wX, \a4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + ldr \d0, [\srcd_ptr, \memd0] + mul \a5\wX, \a5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + ldr \d1, [\srcd_ptr, \memd1] + mul \a6\wX, \a6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + ldr \d2, [\srcd_ptr, \memd2] + mul \a7\wX, \a7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + ldr \d3, [\srcd_ptr, \memd3] + + mls \a4\wX, \t4\wX, \mod\nX[0] + mls \a5\wX, \t5\wX, \mod\nX[0] + mls \a6\wX, \t6\wX, \mod\nX[0] + mls \a7\wX, \t7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mix a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX sub \b0\wX, \a0\wX, \t0\wX mul \t4\wX, \b4\wX, \z4\nX[\h4] @@ -176,7 +464,186 @@ .endm -.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX +.macro wrap_qX_butterfly_mixl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixll a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + ldr \c0, [\srcc_ptr, \memc0] + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + ldr \c1, [\srcc_ptr, \memc1] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + ldr \c2, [\srcc_ptr, \memc2] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + ldr \c3, [\srcc_ptr, \memc3] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixss a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixsls a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + ldr \d0, [\srcd_ptr, \memd0] + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + ldr \d1, [\srcd_ptr, \memd1] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + ldr \d2, [\srcd_ptr, \memd2] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + ldr \d3, [\srcd_ptr, \memd3] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + str \e0, [\srce_ptr, \meme0] + mls \t4\wX, \b4\wX, \mod\nX[0] + str \e1, [\srce_ptr, \meme1] + mls \t5\wX, \b5\wX, \mod\nX[0] + str \e2, [\srce_ptr, \meme2] + mls \t6\wX, \b6\wX, \mod\nX[0] + str \e3, [\srce_ptr, \meme3] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixssl a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, b4, b5, b6, b7, t4, t5, t6, t7, mod, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3, srce_ptr, e0, e1, e2, e3, meme0, meme1, meme2, meme3 + + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b0\wX, \a0\wX, \t0\wX + str \c0, [\srcc_ptr, \memc0] + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b1\wX, \a1\wX, \t1\wX + str \c1, [\srcc_ptr, \memc1] + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b2\wX, \a2\wX, \t2\wX + str \c2, [\srcc_ptr, \memc2] + mul \t7\wX, \b7\wX, \z7\nX[\h7] + sub \b3\wX, \a3\wX, \t3\wX + str \c3, [\srcc_ptr, \memc3] + + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a0\wX, \a0\wX, \t0\wX + str \d0, [\srcd_ptr, \memd0] + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a1\wX, \a1\wX, \t1\wX + str \d1, [\srcd_ptr, \memd1] + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a2\wX, \a2\wX, \t2\wX + str \d2, [\srcd_ptr, \memd2] + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + add \a3\wX, \a3\wX, \t3\wX + str \d3, [\srcd_ptr, \memd3] + + mls \t4\wX, \b4\wX, \mod\nX[0] + ldr \e0, [\srce_ptr, \meme0] + mls \t5\wX, \b5\wX, \mod\nX[0] + ldr \e1, [\srce_ptr, \meme1] + mls \t6\wX, \b6\wX, \mod\nX[0] + ldr \e2, [\srce_ptr, \meme2] + mls \t7\wX, \b7\wX, \mod\nX[0] + ldr \e3, [\srce_ptr, \meme3] + +.endm + +.macro wrap_qX_butterfly_mix_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX mul \t0\wX, \b0\wX, \z0\nX[\h0] sub \b4\wX, \a4\wX, \t4\wX @@ -218,6 +685,80 @@ .endm +.macro wrap_dX_butterfly_vec_top_trn_4x4 a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + mul \t0\wX, \b0\wX, \h0\wX + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + mul \t1\wX, \b1\wX, \h1\wX + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src0_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src0_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + +.macro wrap_dX_butterfly_vec_top_2ltrn_4x4 b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX, src0_ptr, src1_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, trns0, trns1, trns2, trns3, trnt0, trnt1, trnt2, trnt3 + + ldr \c0, [\src0_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + ldr \c1, [\src1_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + + ldr \c2, [\src0_ptr, \memc2] + trn1 \trnt0\().4S, \trns0\().4S, \trns1\().4S + trn2 \trnt1\().4S, \trns0\().4S, \trns1\().4S + ldr \c3, [\src1_ptr, \memc3] + trn1 \trnt2\().4S, \trns2\().4S, \trns3\().4S + trn2 \trnt3\().4S, \trns2\().4S, \trns3\().4S + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + trn1 \trns0\().2D, \trnt0\().2D, \trnt2\().2D + sqrdmulh \b1\wX, \b1\wX, \l1\wX + trn2 \trns2\().2D, \trnt0\().2D, \trnt2\().2D + + mls \t0\wX, \b0\wX, \mod\nX[0] + trn1 \trns1\().2D, \trnt1\().2D, \trnt3\().2D + mls \t1\wX, \b1\wX, \mod\nX[0] + trn2 \trns3\().2D, \trnt1\().2D, \trnt3\().2D + +.endm + .macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX sub \b0\wX, \a0\wX, \t0\wX @@ -228,15 +769,82 @@ .endm -.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_bot_oo_barrett_trn_4x4 a0, a1, b0, b1, t0, t1, a4, a5, a6, a7, t4, t5, t6, t7, a8, a9, a10, a11, t8, t9, t10, t11, barrett_const, shrv, Q, wX, nX + + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] + sub \b0\wX, \a0\wX, \t0\wX + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] + sub \b1\wX, \a1\wX, \t1\wX + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] + add \a0\wX, \a0\wX, \t0\wX + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] + add \a1\wX, \a1\wX, \t1\wX + + sqdmulh \t8\wX, \a8\wX, \barrett_const\nX[1] + srshr \t4\wX, \t4\wX, \shrv + sqdmulh \t9\wX, \a9\wX, \barrett_const\nX[1] + srshr \t5\wX, \t5\wX, \shrv + sqdmulh \t10\wX, \a10\wX, \barrett_const\nX[1] + srshr \t6\wX, \t6\wX, \shrv + sqdmulh \t11\wX, \a11\wX, \barrett_const\nX[1] + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\nX[0] + srshr \t8\wX, \t8\wX, \shrv + mls \a5\wX, \t5\wX, \Q\nX[0] + srshr \t9\wX, \t9\wX, \shrv + mls \a6\wX, \t6\wX, \Q\nX[0] + srshr \t10\wX, \t10\wX, \shrv + mls \a7\wX, \t7\wX, \Q\nX[0] + srshr \t11\wX, \t11\wX, \shrv + + mls \a8\wX, \t8\wX, \Q\nX[0] + trn1 \t4\().4S, \a4\().4S, \a5\().4S + mls \a9\wX, \t9\wX, \Q\nX[0] + trn2 \t5\().4S, \a4\().4S, \a5\().4S + mls \a10\wX, \t10\wX, \Q\nX[0] + trn1 \t6\().4S, \a6\().4S, \a7\().4S + mls \a11\wX, \t11\wX, \Q\nX[0] + + trn2 \t7\().4S, \a6\().4S, \a7\().4S + + trn1 \a4\().2D, \t4\().2D, \t6\().2D + trn2 \a6\().2D, \t4\().2D, \t6\().2D + trn1 \a5\().2D, \t5\().2D, \t7\().2D + trn2 \a7\().2D, \t5\().2D, \t7\().2D +.endm + +.macro wrap_dX_butterfly_vec_mix a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixl a0, a1, b0, b1, t0, t1, b2, b3, t2, t3, mod, l2, h2, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] sub \b0\wX, \a0\wX, \t0\wX mul \t2\wX, \b2\wX, \h2\wX + ldr \c1, [\srcc_ptr, \memc1] sub \b1\wX, \a1\wX, \t1\wX mul \t3\wX, \b3\wX, \h3\wX + ldr \c2, [\srcc_ptr, \memc2] add \a0\wX, \a0\wX, \t0\wX sqrdmulh \b2\wX, \b2\wX, \l2\wX + ldr \c3, [\srcc_ptr, \memc3] add \a1\wX, \a1\wX, \t1\wX sqrdmulh \b3\wX, \b3\wX, \l3\wX @@ -245,7 +853,7 @@ .endm -.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX +.macro wrap_dX_butterfly_vec_mix_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX mul \t0\wX, \b0\wX, \h0\wX sub \b2\wX, \a2\wX, \t2\wX @@ -262,57 +870,98 @@ .endm +.macro wrap_dX_butterfly_vec_mix_rev_l4 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mix_rev_l3 b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, wX, nX, srcc_ptr, c1, c2, c3, memc1, memc2, memc3 + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + ldr \c1, [\srcc_ptr, \memc1] + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + // vector-scalar Barrett reduction .macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv srshr \t2\wX, \t2\wX, \shrv - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t3\wX, \t3\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] - mls \a2\wX, \t2\wX, \Q\wX - mls \a3\wX, \t3\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] + mls \a3\wX, \t3\wX, \Q\nX[0] .endm .macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX - sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] - sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] - sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] - sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[1] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[1] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[1] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[1] srshr \t0\wX, \t0\wX, \shrv - sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[1] srshr \t1\wX, \t1\wX, \shrv - sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[1] srshr \t2\wX, \t2\wX, \shrv - sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[1] srshr \t3\wX, \t3\wX, \shrv - sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[1] - mls \a0\wX, \t0\wX, \Q\wX + mls \a0\wX, \t0\wX, \Q\nX[0] srshr \t4\wX, \t4\wX, \shrv - mls \a1\wX, \t1\wX, \Q\wX + mls \a1\wX, \t1\wX, \Q\nX[0] srshr \t5\wX, \t5\wX, \shrv - mls \a2\wX, \t2\wX, \Q\wX + mls \a2\wX, \t2\wX, \Q\nX[0] srshr \t6\wX, \t6\wX, \shrv - mls \a3\wX, \t3\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\nX[0] srshr \t7\wX, \t7\wX, \shrv - mls \a4\wX, \t4\wX, \Q\wX - mls \a5\wX, \t5\wX, \Q\wX - mls \a6\wX, \t6\wX, \Q\wX - mls \a7\wX, \t7\wX, \Q\wX + mls \a4\wX, \t4\wX, \Q\nX[0] + mls \a5\wX, \t5\wX, \Q\nX[0] + mls \a6\wX, \t6\wX, \Q\nX[0] + mls \a7\wX, \t7\wX, \Q\nX[0] .endm @@ -391,6 +1040,100 @@ .endm +.macro wrap_qX_montgomery_mul_in b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\l3] + + mul \b0\wX, \b0\wX, \z0\nX[\h0] + mul \b1\wX, \b1\wX, \z1\nX[\h1] + mul \b2\wX, \b2\wX, \z2\nX[\h2] + mul \b3\wX, \b3\wX, \z3\nX[\h3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_ins b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + str \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_inl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3 + + ldr \c0, [\srcc_ptr, \memc0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + ldr \c1, [\srcc_ptr, \memc1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + ldr \c2, [\srcc_ptr, \memc2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + ldr \c3, [\srcc_ptr, \memc3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_montgomery_mul_insl b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX, srcc_ptr, c0, c1, c2, c3, memc0, memc1, memc2, memc3, srcd_ptr, d0, d1, d2, d3, memd0, memd1, memd2, memd3 + + str \c0, [\srcc_ptr, \memc0] + ldr \d0, [\srcd_ptr, \memd0] + sqrdmulh \t0\wX, \b0\wX, \z0\nX[\h0] + str \c1, [\srcc_ptr, \memc1] + ldr \d1, [\srcd_ptr, \memd1] + sqrdmulh \t1\wX, \b1\wX, \z1\nX[\h1] + str \c2, [\srcc_ptr, \memc2] + ldr \d2, [\srcd_ptr, \memd2] + sqrdmulh \t2\wX, \b2\wX, \z2\nX[\h2] + str \c3, [\srcc_ptr, \memc3] + ldr \d3, [\srcd_ptr, \memd3] + sqrdmulh \t3\wX, \b3\wX, \z3\nX[\h3] + + mul \b0\wX, \b0\wX, \z0\nX[\l0] + mul \b1\wX, \b1\wX, \z1\nX[\l1] + mul \b2\wX, \b2\wX, \z2\nX[\l2] + mul \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + + + // Montgomery reduction with long .macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX @@ -448,3 +1191,10 @@ add \s3\wX, \a3\wX, \b3\wX .endm + + +#endif + + + + diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/ntt.c b/Modules/PQClean/crypto_sign/dilithium5/aarch64/ntt.c index 2d88c5d5e..ec594a77c 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/ntt.c +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/ntt.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,12 +33,26 @@ */ #include "params.h" -#include "reduce.h" +#include "NTT_params.h" +#include "ntt.h" #include #include -#include "NTT_params.h" -#include "ntt.h" +const __attribute__ ((aligned (16)))int32_t constants[16] = { + Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, + invNQ1R2modQ1_prime_half, + invNQ1R2modQ1_doubleprime, + invNQ1_final_R2modQ1_prime_half, + invNQ1_final_R2modQ1_doubleprime +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { + 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 8380417, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 327391679, -675340520, 987079667, 3073009, 1277625, -2635473, 3852015, 449207, -681503850, 681730119, -15156688, 1753, -2659525, 2660408, -59148, -495951789, -373072124, -456183549, 710479343, -1935420, -1455890, -1780227, 2772600, 8380417, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, -825844983, -799869667, -70227934, 4183372, -3222807, -3121440, -274060, 302950022, 163212680, -1013916752, -841760171, 1182243, 636927, -3956745, -3284915, 22347069, -1016110510, -588452222, -952468207, 87208, -3965306, -2296397, -3716946, 8380417, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 519705671, 496502727, -977780347, 2508980, 2028118, 1937570, -3815725, -7126831, 258649997, -507246529, -1013967746, -27812, 1009365, -1979497, -3956944, 210776307, -628875181, 409185979, -963363710, 822541, -2454145, 1596822, -3759465, 8380417, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, -764594519, -284313712, 1065510939, 2811291, -2983781, -1109516, 4158088, -431820817, 686309310, -909946047, -64176841, -1685153, 2678278, -3551006, -250446, -873958779, -965793731, 162963861, -629190881, -3410568, -3768948, 635956, -2455377, 8380417, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 123678909, 294395108, -759080783, 1528066, 482649, 1148858, -2962264, -1062481036, 561940831, 611800717, -68791907, -4146264, 2192938, 2387513, -268456, -454226054, -442566669, -925511710, -814992530, -1772588, -1727088, -3611750, -3180456, 8380417, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, 43482586, 631001801, -854436357, -565603, 169688, 2462444, -3334383, 960233614, 317727459, 818892658, 321386456, 3747250, 1239911, 3195676, 1254190, 588375860, -983611064, 677264190, -3181859, 2296099, -3838479, 2642980, -12417, 8380417, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -893898890, 509377762, -819295484, -4166425, -3488383, 1987814, -3197248, 768294260, -22883400, -347191365, -335754661, 2998219, -89301, -1354892, -1310261, 36345249, 643961400, 157142369, -568482643, 141835, 2513018, 613238, -2218467, 8380417, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 60323094, -832852657, 834980303, 1736313, 235407, -3250154, 3258457, -117552223, 1035301089, 522531086, -209807681, -458740, 4040196, 2039144, -818761, -492511373, -889718424, -481719139, -558360247, -1921994, -3472069, -1879878, -2178965, 8380417, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, 458160776, -612717067, -577774276, -2579253, 1787943, -2391089, -2254727, -415984810, -608441020, 150224382, 135295244, -1623354, -2374402, 586241, 527981, 539479988, -521163479, -302276083, -702999655, 2105286, -2033807, -1179613, -2743411, 8380417, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, -1071872863, -333129378, -605279149, 3482206, -4182915, -1300016, -2362063, -378477722, 638402564, 130156402, -185731180, -1476985, 2491325, 507927, -724804, 510974714, -356997292, -304395785, -470097680, 1994046, -1393159, -1187885, -1834526, 8380417, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, 630730945, 777970524, 159173408, -1317678, 2461387, 3035980, 621164, -777397036, 678549029, -669544140, 192079267, -3033742, 2647994, -2612853, 749577, -86720197, 771248568, 1063046068, -1030830548, -338420, 3009748, 4148469, -4022750, 8380417, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, -314332144, 749740976, 864652284, 3901472, -1226661, 2925816, 3374250, 1020029345, -413979908, 426738094, 298172236, 3980599, -1615530, 1665318, 1163598, 658309618, 441577800, 519685171, -863376927, 2569011, 1723229, 2028038, -3369273, 8380417, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, -711287812, 687588511, -712065019, 1356448, -2775755, 2683270, -2778788, 1023635298, -351195274, 861908357, 139752717, 3994671, -1370517, 3363542, 545376, -3043996, 773976352, 55063046, -197425671, -11879, 3020393, 214880, -770441, 8380417, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, 592665232, -167401858, -117660617, -3467665, 2312838, -653275, -459163, 795799901, 130212265, 220412084, 35937555, 3105558, 508145, 860144, 140244, -282732136, -141890356, 879049958, -388001774, -1103344, -553718, 3430436, -1514152, 8380417, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, -84011120, 259126110, -603268097, 348812, -327848, 1011223, -2354215, -559928242, 604333585, -772445769, 749801963, -2185084, 2358373, -3014420, 2926054, 800464680, -561979013, -439933955, -100631253, 3123762, -2193087, -1716814, -392707, 8380417, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -492577742, -573161516, 447030292, -3818627, -1922253, -2236726, 1744507, -77645096, -1018462631, 486888731, 270210213, -303005, -3974485, 1900052, 1054478, 904878186, -967019376, -200355636, -187430119, 3531229, -3773731, -781875, -731434 +}; + +const __attribute__ ((aligned (16)))int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1] = { + 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 8380417, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, 573161516, 492577742, 978523985, -1744507, 2236726, 1922253, 3818627, 187430119, 200355636, 967019376, -904878186, 731434, 781875, 3773731, -3531229, -270210213, -486888731, 1018462631, 77645096, -1054478, -1900052, 3974485, 303005, 8380417, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, -259126110, 84011120, -89383150, 2354215, -1011223, 327848, -348812, 100631253, 439933955, 561979013, -800464680, 392707, 1716814, 2193087, -3123762, -749801963, 772445769, -604333585, 559928242, -2926054, 3014420, -2358373, 2185084, 8380417, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 167401858, -592665232, 888589898, 459163, 653275, -2312838, 3467665, 388001774, -879049958, 141890356, 282732136, 1514152, -3430436, 553718, 1103344, -35937555, -220412084, -130212265, -795799901, -140244, -860144, -508145, -3105558, 8380417, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, -687588511, 711287812, -347590090, 2778788, -2683270, 2775755, -1356448, 197425671, -55063046, -773976352, 3043996, 770441, -214880, -3020393, 11879, -139752717, -861908357, 351195274, -1023635298, -545376, -3363542, 1370517, -3994671, 8380417, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -749740976, 314332144, -999753034, -3374250, -2925816, 1226661, -3901472, 863376927, -519685171, -441577800, -658309618, 3369273, -2028038, -1723229, -2569011, -298172236, -426738094, 413979908, -1020029345, -1163598, -1665318, 1615530, -3980599, 8380417, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -777970524, -630730945, 337655269, -621164, -3035980, -2461387, 1317678, 1030830548, -1063046068, -771248568, 86720197, 4022750, -4148469, -3009748, 338420, -192079267, 669544140, -678549029, 777397036, -749577, 2612853, -2647994, 3033742, 8380417, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 333129378, 1071872863, -892316032, 2362063, 1300016, 4182915, -3482206, 470097680, 304395785, 356997292, -510974714, 1834526, 1187885, 1393159, -1994046, 185731180, -130156402, -638402564, 378477722, 724804, -507927, -2491325, 1476985, 8380417, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 612717067, -458160776, 660934133, 2254727, 2391089, -1787943, 2579253, 702999655, 302276083, 521163479, -539479988, 2743411, 1179613, 2033807, -2105286, -135295244, -150224382, 608441020, 415984810, -527981, -586241, 2374402, 1623354, 8380417, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, 832852657, -60323094, -444930577, -3258457, 3250154, -235407, -1736313, 558360247, 481719139, 889718424, 492511373, 2178965, 1879878, 3472069, 1921994, 209807681, -522531086, -1035301089, 117552223, 818761, -2039144, -4040196, 458740, 8380417, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, -509377762, 893898890, 1067647297, 3197248, -1987814, 3488383, 4166425, 568482643, -157142369, -643961400, -36345249, 2218467, -613238, -2513018, -141835, 335754661, 347191365, 22883400, -768294260, 1310261, 1354892, 89301, -2998219, 8380417, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, -631001801, -43482586, 144935890, 3334383, -2462444, -169688, 565603, 3181859, -677264190, 983611064, -588375860, 12417, -2642980, 3838479, -2296099, -321386456, -818892658, -317727459, -960233614, -1254190, -3195676, -1239911, -3747250, 8380417, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, -294395108, -123678909, -391567239, 2962264, -1148858, -482649, -1528066, 814992530, 925511710, 442566669, 454226054, 3180456, 3611750, 1727088, 1772588, 68791907, -611800717, -561940831, 1062481036, 268456, -2387513, -2192938, 4146264, 8380417, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, 284313712, 764594519, -720393920, -4158088, 1109516, 2983781, -2811291, 629190881, -162963861, 965793731, 873958779, 2455377, -635956, 3768948, 3410568, 64176841, 909946047, -686309310, 431820817, 250446, 3551006, -2678278, 1685153, 8380417, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, -496502727, -519705671, -642926661, 3815725, -1937570, -2028118, -2508980, 963363710, -409185979, 628875181, -210776307, 3759465, -1596822, 2454145, -822541, 1013967746, 507246529, -258649997, 7126831, 3956944, 1979497, -1009365, 27812, 8380417, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 799869667, 825844983, -1071989969, 274060, 3121440, 3222807, -4183372, 952468207, 588452222, 1016110510, -22347069, 3716946, 2296397, 3965306, -87208, 841760171, 1013916752, -163212680, -302950022, 3284915, 3956745, -636927, -1182243, 8380417, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, 675340520, -327391679, -787459213, -3852015, 2635473, -1277625, -3073009, -710479343, 456183549, 373072124, 495951789, -2772600, 1780227, 1455890, 1935420, 15156688, -681730119, 681503850, -449207, 59148, -2660408, 2659525, -1753 +}; /************************************************* * Name: ntt diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/ntt.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/ntt.h index 79209c37a..8989c025a 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/ntt.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/ntt.h @@ -1,17 +1,19 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_NTT_H +#define PQCLEAN_DILITHIUM5_AARCH64_NTT_H /* * This file was originally licensed * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,45 +34,42 @@ * SOFTWARE. */ -#include "NTT_params.h" #include "params.h" +#include "NTT_params.h" #include -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot(int *des, const int *table, const int *_constants); +#define constants DILITHIUM_NAMESPACE(constants) +#define streamlined_CT_negacyclic_table_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_CT_negacyclic_table_Q1_jump_extended) +#define streamlined_GS_itable_Q1_jump_extended DILITHIUM_NAMESPACE(streamlined_GS_itable_Q1_jump_extended) + +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); + +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top(int32_t *des, const int32_t *table, const int32_t *_constants); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot(int32_t *des, const int32_t *table, const int32_t *_constants); + +extern +const int32_t constants[16]; -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top(int *des, const int *table, const int *_constants); -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot(int *des, const int *table, const int *_constants); +extern +const int32_t streamlined_CT_negacyclic_table_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define NTT(in) { \ - PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM5_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ - } +extern +const int32_t streamlined_GS_itable_Q1_jump_extended[((NTT_N - 1) + (1 << 0) + (1 << 4)) << 1]; -#define iNTT(in) { \ - PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants); \ - PQCLEAN_DILITHIUM5_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ - } +#define NTT(in) do { \ + PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM5_AARCH64__asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_jump_extended, constants); \ + } while(0) + +#define iNTT(in) do { \ + PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_bot(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + PQCLEAN_DILITHIUM5_AARCH64__asm_intt_SIMD_top(in, streamlined_GS_itable_Q1_jump_extended, constants); \ + } while(0) #define ntt DILITHIUM_NAMESPACE(ntt) void ntt(int32_t a[ARRAY_N]); #define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont) void invntt_tomont(int32_t a[ARRAY_N]); -static const int constants[16] = { - Q1, -Q1prime, RmodQ1_prime_half, RmodQ1_doubleprime, - invNQ1R2modQ1_prime_half, - invNQ1R2modQ1_doubleprime, - invNQ1_final_R2modQ1_prime_half, - invNQ1_final_R2modQ1_doubleprime -}; - -static const int streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, -915382907, -3572223, 964937599, 3765607, 963888510, 3761513, -820383522, -3201494, -738955404, -2883726, -806080660, -3145678, -820367122, -3201430, -154181397, -601683, 907762539, 3542485, 687336873, 2682288, 545785280, 2129892, 964747974, 3764867, -257592709, -1005239, 142848732, 557458, -312926867, -1221177, 0, 0, -863652652, -3370349, 923069133, 3602218, 815613168, 3182878, 787459213, 3073009, 327391679, 1277625, -675340520, -2635473, 987079667, 3852015, 449207, 1753, -495951789, -1935420, -681503850, -2659525, -373072124, -1455890, 681730119, 2660408, -456183549, -1780227, -15156688, -59148, 710479343, 2772600, 0, 0, -1041158200, -4063053, 702264730, 2740543, -919027554, -3586446, 1071989969, 4183372, -825844983, -3222807, -799869667, -3121440, -70227934, -274060, 302950022, 1182243, 22347069, 87208, 163212680, 636927, -1016110510, -3965306, -1013916752, -3956745, -588452222, -2296397, -841760171, -3284915, -952468207, -3716946, 0, 0, 682491182, 2663378, -797147778, -3110818, 538486762, 2101410, 642926661, 2508980, 519705671, 2028118, 496502727, 1937570, -977780347, -3815725, -7126831, -27812, 210776307, 822541, 258649997, 1009365, -628875181, -2454145, -507246529, -1979497, 409185979, 1596822, -1013967746, -3956944, -963363710, -3759465, 0, 0, -429120452, -1674615, 949361686, 3704823, 297218217, 1159875, 720393920, 2811291, -764594519, -2983781, -284313712, -1109516, 1065510939, 4158088, -431820817, -1685153, -873958779, -3410568, 686309310, 2678278, -965793731, -3768948, -909946047, -3551006, 162963861, 635956, -64176841, -250446, -629190881, -2455377, 0, 0, -903139016, -3524442, 101000509, 394148, 237992130, 928749, 391567239, 1528066, 123678909, 482649, 294395108, 1148858, -759080783, -2962264, -1062481036, -4146264, -454226054, -1772588, 561940831, 2192938, -442566669, -1727088, 611800717, 2387513, -925511710, -3611750, -68791907, -268456, -814992530, -3180456, 0, 0, -111244624, -434125, 280713909, 1095468, -898510625, -3506380, -144935890, -565603, 43482586, 169688, 631001801, 2462444, -854436357, -3334383, 960233614, 3747250, 588375860, 2296099, 317727459, 1239911, -983611064, -3838479, 818892658, 3195676, 677264190, 2642980, 321386456, 1254190, -3181859, -12417, 0, 0, 173376332, 676590, 530906624, 2071829, -1029866791, -4018989, -1067647297, -4166425, -893898890, -3488383, 509377762, 1987814, -819295484, -3197248, 768294260, 2998219, 36345249, 141835, -22883400, -89301, 643961400, 2513018, -347191365, -1354892, 157142369, 613238, -335754661, -1310261, -568482643, -2218467, 0, 0, -342333886, -1335936, 830756018, 3241972, 552488273, 2156050, 444930577, 1736313, 60323094, 235407, -832852657, -3250154, 834980303, 3258457, -117552223, -458740, -492511373, -1921994, 1035301089, 4040196, -889718424, -3472069, 522531086, 2039144, -481719139, -1879878, -209807681, -818761, -558360247, -2178965, 0, 0, -827143915, -3227876, 875112161, 3415069, 450833045, 1759347, -660934133, -2579253, 458160776, 1787943, -612717067, -2391089, -577774276, -2254727, -415984810, -1623354, 539479988, 2105286, -608441020, -2374402, -521163479, -2033807, 150224382, 586241, -302276083, -1179613, 135295244, 527981, -702999655, -2743411, 0, 0, 439288460, 1714295, -209493775, -817536, -915957677, -3574466, 892316032, 3482206, -1071872863, -4182915, -333129378, -1300016, -605279149, -2362063, -378477722, -1476985, 510974714, 1994046, 638402564, 2491325, -356997292, -1393159, 130156402, 507927, -304395785, -1187885, -185731180, -724804, -470097680, -1834526, 0, 0, 628833668, 2453983, 962678241, 3756790, -496048908, -1935799, -337655269, -1317678, 630730945, 2461387, 777970524, 3035980, 159173408, 621164, -777397036, -3033742, -86720197, -338420, 678549029, 2647994, 771248568, 3009748, -669544140, -2612853, 1063046068, 4148469, 192079267, 749577, -1030830548, -4022750, 0, 0, 374309300, 1460718, -439978542, -1716988, -1012201926, -3950053, 999753034, 3901472, -314332144, -1226661, 749740976, 2925816, 864652284, 3374250, 1020029345, 3980599, 658309618, 2569011, -413979908, -1615530, 441577800, 1723229, 426738094, 1665318, 519685171, 2028038, 298172236, 1163598, -863376927, -3369273, 0, 0, -164673562, -642628, -742437332, -2897314, 818041395, 3192354, 347590090, 1356448, -711287812, -2775755, 687588511, 2683270, -712065019, -2778788, 1023635298, 3994671, -3043996, -11879, -351195274, -1370517, 773976352, 3020393, 861908357, 3363542, 55063046, 214880, 139752717, 545376, -197425671, -770441, 0, 0, -918682129, -3585098, 142694469, 556856, 991769559, 3870317, -888589898, -3467665, 592665232, 2312838, -167401858, -653275, -117660617, -459163, 795799901, 3105558, -282732136, -1103344, 130212265, 508145, -141890356, -553718, 220412084, 860144, 879049958, 3430436, 35937555, 140244, -388001774, -1514152, 0, 0, 721508096, 2815639, 747568486, 2917338, 475038184, 1853806, 89383150, 348812, -84011120, -327848, 259126110, 1011223, -603268097, -2354215, -559928242, -2185084, 800464680, 3123762, 604333585, 2358373, -561979013, -2193087, -772445769, -3014420, -439933955, -1716814, 749801963, 2926054, -100631253, -392707, 0, 0, 585207070, 2283733, 857403734, 3345963, 476219497, 1858416, -978523985, -3818627, -492577742, -1922253, -573161516, -2236726, 447030292, 1744507, -77645096, -303005, 904878186, 3531229, -1018462631, -3974485, -967019376, -3773731, 486888731, 1900052, -200355636, -781875, 270210213, 1054478, -187430119, -731434, 0, 0 -}; - -static const int streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 4)) << 1] = { - 0, 0, 915382907, 3572223, -963888510, -3761513, -964937599, -3765607, 820367122, 3201430, 806080660, 3145678, 738955404, 2883726, 820383522, 3201494, 312926867, 1221177, -142848732, -557458, 257592709, 1005239, -964747974, -3764867, -545785280, -2129892, -687336873, -2682288, -907762539, -3542485, 154181397, 601683, 0, 0, -585207070, -2283733, -476219497, -1858416, -857403734, -3345963, -447030292, -1744507, 573161516, 2236726, 492577742, 1922253, 978523985, 3818627, 187430119, 731434, -270210213, -1054478, 200355636, 781875, -486888731, -1900052, 967019376, 3773731, 1018462631, 3974485, -904878186, -3531229, 77645096, 303005, 0, 0, -721508096, -2815639, -475038184, -1853806, -747568486, -2917338, 603268097, 2354215, -259126110, -1011223, 84011120, 327848, -89383150, -348812, 100631253, 392707, -749801963, -2926054, 439933955, 1716814, 772445769, 3014420, 561979013, 2193087, -604333585, -2358373, -800464680, -3123762, 559928242, 2185084, 0, 0, 918682129, 3585098, -991769559, -3870317, -142694469, -556856, 117660617, 459163, 167401858, 653275, -592665232, -2312838, 888589898, 3467665, 388001774, 1514152, -35937555, -140244, -879049958, -3430436, -220412084, -860144, 141890356, 553718, -130212265, -508145, 282732136, 1103344, -795799901, -3105558, 0, 0, 164673562, 642628, -818041395, -3192354, 742437332, 2897314, 712065019, 2778788, -687588511, -2683270, 711287812, 2775755, -347590090, -1356448, 197425671, 770441, -139752717, -545376, -55063046, -214880, -861908357, -3363542, -773976352, -3020393, 351195274, 1370517, 3043996, 11879, -1023635298, -3994671, 0, 0, -374309300, -1460718, 1012201926, 3950053, 439978542, 1716988, -864652284, -3374250, -749740976, -2925816, 314332144, 1226661, -999753034, -3901472, 863376927, 3369273, -298172236, -1163598, -519685171, -2028038, -426738094, -1665318, -441577800, -1723229, 413979908, 1615530, -658309618, -2569011, -1020029345, -3980599, 0, 0, -628833668, -2453983, 496048908, 1935799, -962678241, -3756790, -159173408, -621164, -777970524, -3035980, -630730945, -2461387, 337655269, 1317678, 1030830548, 4022750, -192079267, -749577, -1063046068, -4148469, 669544140, 2612853, -771248568, -3009748, -678549029, -2647994, 86720197, 338420, 777397036, 3033742, 0, 0, -439288460, -1714295, 915957677, 3574466, 209493775, 817536, 605279149, 2362063, 333129378, 1300016, 1071872863, 4182915, -892316032, -3482206, 470097680, 1834526, 185731180, 724804, 304395785, 1187885, -130156402, -507927, 356997292, 1393159, -638402564, -2491325, -510974714, -1994046, 378477722, 1476985, 0, 0, 827143915, 3227876, -450833045, -1759347, -875112161, -3415069, 577774276, 2254727, 612717067, 2391089, -458160776, -1787943, 660934133, 2579253, 702999655, 2743411, -135295244, -527981, 302276083, 1179613, -150224382, -586241, 521163479, 2033807, 608441020, 2374402, -539479988, -2105286, 415984810, 1623354, 0, 0, 342333886, 1335936, -552488273, -2156050, -830756018, -3241972, -834980303, -3258457, 832852657, 3250154, -60323094, -235407, -444930577, -1736313, 558360247, 2178965, 209807681, 818761, 481719139, 1879878, -522531086, -2039144, 889718424, 3472069, -1035301089, -4040196, 492511373, 1921994, 117552223, 458740, 0, 0, -173376332, -676590, 1029866791, 4018989, -530906624, -2071829, 819295484, 3197248, -509377762, -1987814, 893898890, 3488383, 1067647297, 4166425, 568482643, 2218467, 335754661, 1310261, -157142369, -613238, 347191365, 1354892, -643961400, -2513018, 22883400, 89301, -36345249, -141835, -768294260, -2998219, 0, 0, 111244624, 434125, 898510625, 3506380, -280713909, -1095468, 854436357, 3334383, -631001801, -2462444, -43482586, -169688, 144935890, 565603, 3181859, 12417, -321386456, -1254190, -677264190, -2642980, -818892658, -3195676, 983611064, 3838479, -317727459, -1239911, -588375860, -2296099, -960233614, -3747250, 0, 0, 903139016, 3524442, -237992130, -928749, -101000509, -394148, 759080783, 2962264, -294395108, -1148858, -123678909, -482649, -391567239, -1528066, 814992530, 3180456, 68791907, 268456, 925511710, 3611750, -611800717, -2387513, 442566669, 1727088, -561940831, -2192938, 454226054, 1772588, 1062481036, 4146264, 0, 0, 429120452, 1674615, -297218217, -1159875, -949361686, -3704823, -1065510939, -4158088, 284313712, 1109516, 764594519, 2983781, -720393920, -2811291, 629190881, 2455377, 64176841, 250446, -162963861, -635956, 909946047, 3551006, 965793731, 3768948, -686309310, -2678278, 873958779, 3410568, 431820817, 1685153, 0, 0, -682491182, -2663378, -538486762, -2101410, 797147778, 3110818, 977780347, 3815725, -496502727, -1937570, -519705671, -2028118, -642926661, -2508980, 963363710, 3759465, 1013967746, 3956944, -409185979, -1596822, 507246529, 1979497, 628875181, 2454145, -258649997, -1009365, -210776307, -822541, 7126831, 27812, 0, 0, 1041158200, 4063053, 919027554, 3586446, -702264730, -2740543, 70227934, 274060, 799869667, 3121440, 825844983, 3222807, -1071989969, -4183372, 952468207, 3716946, 841760171, 3284915, 588452222, 2296397, 1013916752, 3956745, 1016110510, 3965306, -163212680, -636927, -22347069, -87208, -302950022, -1182243, 0, 0, 863652652, 3370349, -815613168, -3182878, -923069133, -3602218, -987079667, -3852015, 675340520, 2635473, -327391679, -1277625, -787459213, -3073009, -710479343, -2772600, 15156688, 59148, 456183549, 1780227, -681730119, -2660408, 373072124, 1455890, 681503850, 2659525, 495951789, 1935420, -449207, -1753, 0, 0 -}; - #endif diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/packing.c b/Modules/PQClean/crypto_sign/dilithium5/aarch64/packing.c index 8fa3b0ccb..0f033da1d 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/packing.c +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/packing.c @@ -19,7 +19,7 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1) { unsigned int i; @@ -45,7 +45,7 @@ void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], **************************************************/ void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) { + const uint8_t pk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -71,7 +71,7 @@ void unpack_pk(uint8_t rho[SEEDBYTES], * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 **************************************************/ -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -129,7 +129,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]) { + const uint8_t sk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES]) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -172,7 +172,7 @@ void unpack_sk(uint8_t rho[SEEDBYTES], * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h **************************************************/ -void pack_sig(uint8_t sig[CRYPTO_BYTES], +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h) { @@ -221,7 +221,7 @@ void pack_sig(uint8_t sig[CRYPTO_BYTES], int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, - const uint8_t sig[CRYPTO_BYTES]) { + const uint8_t sig[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES]) { unsigned int i, j, k; for (i = 0; i < CTILDEBYTES; ++i) { diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/packing.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/packing.h index fb70ce5db..1d2c448a1 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/packing.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/packing.h @@ -1,5 +1,5 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_PACKING_H +#define PQCLEAN_DILITHIUM5_AARCH64_PACKING_H /* * This file is dual licensed @@ -7,15 +7,16 @@ * or public domain at https://github.com/pq-crystals/dilithium */ +#include "api.h" #include "params.h" #include "polyvec.h" #include #define pack_pk DILITHIUM_NAMESPACE(pack_pk) -void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); +void pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); #define pack_sk DILITHIUM_NAMESPACE(pack_sk) -void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], +void pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES], const uint8_t rho[SEEDBYTES], const uint8_t tr[TRBYTES], const uint8_t key[SEEDBYTES], @@ -24,10 +25,10 @@ void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], const polyveck *s2); #define pack_sig DILITHIUM_NAMESPACE(pack_sig) -void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); +void pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h); #define unpack_pk DILITHIUM_NAMESPACE(unpack_pk) -void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]); +void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES]); #define unpack_sk DILITHIUM_NAMESPACE(unpack_sk) void unpack_sk(uint8_t rho[SEEDBYTES], @@ -36,9 +37,9 @@ void unpack_sk(uint8_t rho[SEEDBYTES], polyveck *t0, polyvecl *s1, polyveck *s2, - const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + const uint8_t sk[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_SECRETKEYBYTES]); #define unpack_sig DILITHIUM_NAMESPACE(unpack_sig) -int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); +int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES]); #endif diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/params.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/params.h index 8c7adddef..71681fd81 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/params.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_PARAMS_H +#define PQCLEAN_DILITHIUM5_AARCH64_PARAMS_H /* * This file is dual licensed @@ -8,11 +8,11 @@ */ // #define DILITHIUM_MODE 2 -//#define DILITHIUM_MODE 3 +// #define DILITHIUM_MODE 3 #define DILITHIUM_MODE 5 -#define CRYPTO_NAMESPACETOP PQCLEAN_DILITHIUM5_AARCH64_crypto_sign #define CRYPTO_NAMESPACE(s) PQCLEAN_DILITHIUM5_AARCH64_##s +#define CRYPTO_NAMESPACETOP crypto_sign #define DILITHIUM_NAMESPACETOP CRYPTO_NAMESPACETOP #define DILITHIUM_NAMESPACE(s) CRYPTO_NAMESPACE(s) @@ -40,18 +40,20 @@ #define POLYT0_PACKEDBYTES 416 #define POLYVECH_PACKEDBYTES (OMEGA + K) +// GAMMA1 == (1 << 19) #define POLYZ_PACKEDBYTES 640 +// GAMMA2 == (DILITHIUM_Q-1)/88 #define POLYW1_PACKEDBYTES 128 #define POLYETA_PACKEDBYTES 96 -#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) -#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ - + TRBYTES \ - + L*POLYETA_PACKEDBYTES \ - + K*POLYETA_PACKEDBYTES \ - + K*POLYT0_PACKEDBYTES) -#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) +#define DILITHIUM_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define DILITHIUM_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \ + + TRBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define DILITHIUM_CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/poly.c b/Modules/PQClean/crypto_sign/dilithium5/aarch64/poly.c index 782e725a8..0d746a4c3 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/poly.c +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/poly.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -35,16 +37,9 @@ #include "reduce.h" #include "rounding.h" #include "symmetric.h" -#include - -#include "fips202x2.h" - -#include "NTT_params.h" +#include "keccak2x/fips202x2.h" #include "ntt.h" - -static const int32_t montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; +#include #define DBENCH_START() #define DBENCH_STOP(t) @@ -57,11 +52,11 @@ static const int32_t montgomery_const[4] = { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce(int32_t *, const int32_t *); void poly_reduce(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_reduce(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_reduce(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -74,11 +69,11 @@ void poly_reduce(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq(int32_t *, const int32_t *); void poly_caddq(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_caddq(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_caddq(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -91,11 +86,11 @@ void poly_caddq(poly *a) { * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze(int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze(int32_t *, const int32_t *); void poly_freeze(poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_freeze(a->coeffs, montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_freeze(a->coeffs, constants); DBENCH_STOP(*tred); } @@ -205,11 +200,11 @@ void poly_invntt_tomont(poly *a) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery(int32_t *des, const int32_t *src1, const int32_t *src2, const int32_t *table); void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs, constants); DBENCH_STOP(*tmul); } @@ -226,11 +221,11 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { * - poly *a0: pointer to output polynomial with coefficients c0 * - const poly *a: pointer to input polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round(int32_t *, int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round(int32_t *, int32_t *, const int32_t *); void poly_power2round(poly *a1, poly *a0, const poly *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); + PQCLEAN_DILITHIUM5_AARCH64__asm_poly_power2round(a1->coeffs, a0->coeffs, a->coeffs); DBENCH_STOP(*tround); } @@ -730,11 +725,11 @@ void polyt1_pack(uint8_t *r, const poly *a) { * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32(int32_t *, const uint8_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32(int32_t *, const uint8_t *); void polyt1_unpack(poly *r, const uint8_t *a) { DBENCH_START(); - PQCLEAN_DILITHIUM5_AARCH64_asm_10_to_32(r->coeffs, a); + PQCLEAN_DILITHIUM5_AARCH64__asm_10_to_32(r->coeffs, a); DBENCH_STOP(*tpack); } diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/poly.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/poly.h index c253ecf69..158e3e2d5 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/poly.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/poly.h @@ -1,5 +1,5 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_POLY_H +#define PQCLEAN_DILITHIUM5_AARCH64_POLY_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/polyvec.c b/Modules/PQClean/crypto_sign/dilithium5/aarch64/polyvec.c index e6ec99f92..ce4828e99 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/polyvec.c +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/polyvec.c @@ -4,12 +4,14 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * * Copyright (c) 2023: Hanno Becker, Vincent Hwang, Matthias J. Kannwischer, Bo-Yin Yang, and Shang-Yi Yang + * Copyright (c) 2023: Vincent Hwang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,13 +35,9 @@ #include "params.h" #include "poly.h" #include "polyvec.h" -#include - +#include "ntt.h" #include "reduce.h" - -static const int32_t l_montgomery_const[4] = { - DILITHIUM_Q, DILITHIUM_QINV -}; +#include /************************************************* * Name: expand_mat @@ -177,11 +175,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyve * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -extern void PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); +extern void PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery(int32_t *, const int32_t *, const int32_t *, const int32_t *); void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { - PQCLEAN_DILITHIUM5_AARCH64_asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, l_montgomery_const); + PQCLEAN_DILITHIUM5_AARCH64__asm_polyvecl_pointwise_acc_montgomery(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs, constants); } /************************************************* diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/polyvec.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/polyvec.h index dc3377c93..d67f75903 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/polyvec.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/polyvec.h @@ -1,5 +1,5 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_POLYVEC_H +#define PQCLEAN_DILITHIUM5_AARCH64_POLYVEC_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/reduce.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/reduce.h index 9042e6cb0..2be7f6a77 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/reduce.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_REDUCE_H +#define PQCLEAN_DILITHIUM5_AARCH64_REDUCE_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/rounding.c b/Modules/PQClean/crypto_sign/dilithium5/aarch64/rounding.c index 871c97595..c01432776 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/rounding.c +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/rounding.c @@ -95,7 +95,8 @@ int32_t use_hint(int32_t a, unsigned int hint) { if (a0 > 0) { return (a1 + 1) & 15; + } else { + return (a1 - 1) & 15; } - return (a1 - 1) & 15; } diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/rounding.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/rounding.h index 36167d2af..1f6be28bb 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/rounding.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_ROUNDING_H +#define PQCLEAN_DILITHIUM5_AARCH64_ROUNDING_H /* * This file is dual licensed diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/sign.c b/Modules/PQClean/crypto_sign/dilithium5/aarch64/sign.c index 3565b3704..156b994fc 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/sign.c +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/sign.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -90,7 +91,7 @@ int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { pack_pk(pk, rho, &t1); /* Compute H(rho, t1) and write secret key */ - shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(tr, TRBYTES, pk, PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES); pack_sk(sk, rho, tr, key, &t0, &s1, &s2); return 0; @@ -210,7 +211,7 @@ int crypto_sign_signature(uint8_t *sig, /* Write signature */ pack_sig(sig, sig, &z, &h); - *siglen = CRYPTO_BYTES; + *siglen = PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES; return 0; } @@ -238,9 +239,9 @@ int crypto_sign(uint8_t *sm, size_t i; for (i = 0; i < mlen; ++i) { - sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + sm[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; } - crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); + crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES, mlen, sk); *smlen += mlen; return 0; } @@ -274,7 +275,7 @@ int crypto_sign_verify(const uint8_t *sig, polyveck t1, w1, h; shake256incctx state; - if (siglen != CRYPTO_BYTES) { + if (siglen != PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES) { return -1; } @@ -287,7 +288,7 @@ int crypto_sign_verify(const uint8_t *sig, } /* Compute CRH(H(rho, t1), msg) */ - shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256(mu, CRHBYTES, pk, PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_PUBLICKEYBYTES); shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -353,17 +354,17 @@ int crypto_sign_open(uint8_t *m, const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; - if (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES; + if (crypto_sign_verify(sm, PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] = sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM5_AARCH64_CRYPTO_BYTES + i]; } return 0; } diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/sign.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/sign.h index bc8c42658..fbccb2c76 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/sign.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/sign.h @@ -1,5 +1,5 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_SIGN_H +#define PQCLEAN_DILITHIUM5_AARCH64_SIGN_H /* * This file is dual licensed @@ -24,7 +24,7 @@ int crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); -#define crypto_sign DILITHIUM_NAMESPACETOP +#define crypto_sign DILITHIUM_NAMESPACE(crypto_sign) int crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk); diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/symmetric-shake.c b/Modules/PQClean/crypto_sign/dilithium5/aarch64/symmetric-shake.c index a53074aac..53aab1c94 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/symmetric-shake.c +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/symmetric-shake.c @@ -4,8 +4,9 @@ * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * diff --git a/Modules/PQClean/crypto_sign/dilithium5/aarch64/symmetric.h b/Modules/PQClean/crypto_sign/dilithium5/aarch64/symmetric.h index 40b928ec6..1a2a89d85 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/aarch64/symmetric.h +++ b/Modules/PQClean/crypto_sign/dilithium5/aarch64/symmetric.h @@ -1,13 +1,14 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM5_AARCH64_SYMMETRIC_H +#define PQCLEAN_DILITHIUM5_AARCH64_SYMMETRIC_H /* * This file was originally licensed * under Apache 2.0 (https://www.apache.org/licenses/LICENSE-2.0.html) or * public domain at https://github.com/pq-crystals/dilithium/tree/master/ref * - * We choose - * CC0 1.0 Universal or the following MIT License + * We offer + * CC0 1.0 Universal or the following MIT License for this file. + * You may freely choose one of them that applies. * * MIT License * @@ -33,7 +34,7 @@ */ #include "fips202.h" -#include "fips202x2.h" +#include "keccak2x/fips202x2.h" #include "params.h" #include diff --git a/Modules/PQClean/crypto_sign/dilithium5/clean/sign.c b/Modules/PQClean/crypto_sign/dilithium5/clean/sign.c index 2524c470f..d7a85ebfd 100644 --- a/Modules/PQClean/crypto_sign/dilithium5/clean/sign.c +++ b/Modules/PQClean/crypto_sign/dilithium5/clean/sign.c @@ -337,7 +337,7 @@ int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_open(uint8_t *m, badsig: /* Signature verification failed */ - *mlen = -1; + *mlen = (size_t) -1; for (i = 0; i < smlen; ++i) { m[i] = 0; } diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/avx2/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/avx2/hash_sha2.c index 329753380..a03540d3b 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/avx2/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/avx2/sha256x8.c b/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/avx2/sha256x8.c index d97750c09..d2afbb0c4 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/avx2/sha256x8.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/clean/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/clean/hash_sha2.c index 329753380..a03540d3b 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/clean/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-128f-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/avx2/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/avx2/hash_sha2.c index 329753380..a03540d3b 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/avx2/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/avx2/sha256x8.c b/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/avx2/sha256x8.c index d97750c09..d2afbb0c4 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/avx2/sha256x8.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/clean/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/clean/hash_sha2.c index 329753380..a03540d3b 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/clean/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-128s-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/avx2/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/avx2/hash_sha2.c index 5ba5e9cf3..828558f00 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/avx2/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/avx2/sha256x8.c b/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/avx2/sha256x8.c index d97750c09..d2afbb0c4 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/avx2/sha256x8.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/clean/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/clean/hash_sha2.c index 5ba5e9cf3..828558f00 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/clean/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-192f-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/avx2/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/avx2/hash_sha2.c index 5ba5e9cf3..828558f00 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/avx2/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/avx2/sha256x8.c b/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/avx2/sha256x8.c index d97750c09..d2afbb0c4 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/avx2/sha256x8.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/clean/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/clean/hash_sha2.c index 5ba5e9cf3..828558f00 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/clean/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-192s-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/avx2/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/avx2/hash_sha2.c index 5ba5e9cf3..828558f00 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/avx2/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/avx2/sha256x8.c b/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/avx2/sha256x8.c index d97750c09..d2afbb0c4 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/avx2/sha256x8.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/clean/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/clean/hash_sha2.c index 5ba5e9cf3..828558f00 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/clean/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-256f-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/avx2/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/avx2/hash_sha2.c index 5ba5e9cf3..828558f00 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/avx2/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/avx2/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/avx2/sha256x8.c b/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/avx2/sha256x8.c index d97750c09..d2afbb0c4 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/avx2/sha256x8.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/avx2/sha256x8.c @@ -133,7 +133,7 @@ void mgf1x8(unsigned char *outx8, unsigned long outlen, memcpy(inbufx8 + 7 * (inlen + 4), in7, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { for (j = 0; j < 8; j++) { u32_to_bytes(inbufx8 + inlen + j * (inlen + 4), i); } diff --git a/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/clean/hash_sha2.c b/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/clean/hash_sha2.c index 5ba5e9cf3..828558f00 100644 --- a/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/clean/hash_sha2.c +++ b/Modules/PQClean/crypto_sign/sphincs-sha2-256s-simple/clean/hash_sha2.c @@ -31,7 +31,7 @@ void mgf1_256(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA256 output.. */ - for (i = 0; (i + 1)*SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA256_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha256(out, inbuf, inlen + 4); out += SPX_SHA256_OUTPUT_BYTES; @@ -56,7 +56,7 @@ void mgf1_512(unsigned char *out, unsigned long outlen, memcpy(inbuf, in, inlen); /* While we can fit in at least another full block of SHA512 output.. */ - for (i = 0; (i + 1)*SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { + for (i = 0; (i + 1) * SPX_SHA512_OUTPUT_BYTES <= outlen; i++) { u32_to_bytes(inbuf + inlen, i); sha512(out, inbuf, inlen + 4); out += SPX_SHA512_OUTPUT_BYTES; diff --git a/Modules/PQClean/crypto_sign/sphincs-shake-128f-simple/avx2/thash_shake_simplex4.c b/Modules/PQClean/crypto_sign/sphincs-shake-128f-simple/avx2/thash_shake_simplex4.c index 89dc9a422..bbe043852 100644 --- a/Modules/PQClean/crypto_sign/sphincs-shake-128f-simple/avx2/thash_shake_simplex4.c +++ b/Modules/PQClean/crypto_sign/sphincs-shake-128f-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/Modules/PQClean/crypto_sign/sphincs-shake-128s-simple/avx2/thash_shake_simplex4.c b/Modules/PQClean/crypto_sign/sphincs-shake-128s-simple/avx2/thash_shake_simplex4.c index 89dc9a422..bbe043852 100644 --- a/Modules/PQClean/crypto_sign/sphincs-shake-128s-simple/avx2/thash_shake_simplex4.c +++ b/Modules/PQClean/crypto_sign/sphincs-shake-128s-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/Modules/PQClean/crypto_sign/sphincs-shake-192f-simple/avx2/thash_shake_simplex4.c b/Modules/PQClean/crypto_sign/sphincs-shake-192f-simple/avx2/thash_shake_simplex4.c index 89dc9a422..bbe043852 100644 --- a/Modules/PQClean/crypto_sign/sphincs-shake-192f-simple/avx2/thash_shake_simplex4.c +++ b/Modules/PQClean/crypto_sign/sphincs-shake-192f-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/Modules/PQClean/crypto_sign/sphincs-shake-192s-simple/avx2/thash_shake_simplex4.c b/Modules/PQClean/crypto_sign/sphincs-shake-192s-simple/avx2/thash_shake_simplex4.c index 89dc9a422..bbe043852 100644 --- a/Modules/PQClean/crypto_sign/sphincs-shake-192s-simple/avx2/thash_shake_simplex4.c +++ b/Modules/PQClean/crypto_sign/sphincs-shake-192s-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/Modules/PQClean/crypto_sign/sphincs-shake-256f-simple/avx2/thash_shake_simplex4.c b/Modules/PQClean/crypto_sign/sphincs-shake-256f-simple/avx2/thash_shake_simplex4.c index 89dc9a422..bbe043852 100644 --- a/Modules/PQClean/crypto_sign/sphincs-shake-256f-simple/avx2/thash_shake_simplex4.c +++ b/Modules/PQClean/crypto_sign/sphincs-shake-256f-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/Modules/PQClean/crypto_sign/sphincs-shake-256s-simple/avx2/thash_shake_simplex4.c b/Modules/PQClean/crypto_sign/sphincs-shake-256s-simple/avx2/thash_shake_simplex4.c index 89dc9a422..bbe043852 100644 --- a/Modules/PQClean/crypto_sign/sphincs-shake-256s-simple/avx2/thash_shake_simplex4.c +++ b/Modules/PQClean/crypto_sign/sphincs-shake-256s-simple/avx2/thash_shake_simplex4.c @@ -58,9 +58,9 @@ void thashx4(unsigned char *out0, } state[16] = _mm256_set1_epi64x((long long)(0x80ULL << 56)); state[(SPX_N / 8) * (1 + inblocks) + 4] = _mm256_xor_si256( - state[(SPX_N / 8) * (1 + inblocks) + 4], - _mm256_set1_epi64x(0x1f) - ); + state[(SPX_N / 8) * (1 + inblocks) + 4], + _mm256_set1_epi64x(0x1f) + ); for (int i = 17; i < 25; i++) { state[i] = _mm256_set1_epi64x(0); } diff --git a/Modules/PQClean/test/crypto_sign/functest.c b/Modules/PQClean/test/crypto_sign/functest.c index 8850f3ba7..9f82423f0 100644 --- a/Modules/PQClean/test/crypto_sign/functest.c +++ b/Modules/PQClean/test/crypto_sign/functest.c @@ -5,6 +5,9 @@ #include #include #include +#ifdef PQCLEAN_USE_VALGRIND +#include +#endif #ifndef NTESTS #define NTESTS 5 @@ -78,6 +81,15 @@ inline static void *malloc_s(size_t size) { #endif static int test_sign(void) { + /* + * In order to properly test variable-length signatures, we need to check + * that the implementation does not modify the provided buffer beyond the + * reported signature length. We do this by filling the buffer with random + * bytes before the call to sign and checking afterward that the tail has + * not been modified. + */ + uint8_t sm_random_cmp[MLEN + CRYPTO_BYTES]; + /* * This is most likely going to be aligned by the compiler. * 16 extra bytes for canary @@ -124,8 +136,29 @@ static int test_sign(void) { RETURNS_ZERO(crypto_sign_keypair(pk + 8, sk + 8)); randombytes(m + 8, MLEN); + // Fill the sm buffer with random bytes + randombytes(sm_random_cmp, MLEN + CRYPTO_BYTES); + memcpy(sm + 8, sm_random_cmp, MLEN + CRYPTO_BYTES); + +#ifdef PQCLEAN_USE_VALGRIND + /* + * With this buffer marked as undefined, valgrind will detect + * cases where the signing code depends on the value of the tail + * of the buffer. + */ + VALGRIND_MAKE_MEM_UNDEFINED(sm + 8, MLEN + CRYPTO_BYTES); +#endif + RETURNS_ZERO(crypto_sign(sm + 8, &smlen, m + 8, MLEN, sk + 8)); +#ifdef PQCLEAN_USE_VALGRIND + // We have to mark the tail as defined before doing the memcmp. + VALGRIND_MAKE_MEM_DEFINED(sm + 8 + smlen, MLEN + CRYPTO_BYTES - smlen); +#endif + + // check that the tail has not been modified + RETURNS_ZERO(memcmp(sm + 8 + smlen, sm_random_cmp + smlen, MLEN + CRYPTO_BYTES - smlen)); + // By relying on m == sm we prevent having to allocate CRYPTO_BYTES // twice if ((returncode = @@ -157,6 +190,15 @@ static int test_sign(void) { } static int test_sign_detached(void) { + /* + * In order to properly test variable-length signatures, we need to check + * that the implementation does not modify the provided buffer beyond the + * reported signature length. We do this by filling the buffer with random + * bytes before the call to sign and checking afterward that the tail has + * not been modified. + */ + uint8_t sig_random_cmp[CRYPTO_BYTES]; + /* * This is most likely going to be aligned by the compiler. * 16 extra bytes for canary @@ -202,8 +244,30 @@ static int test_sign_detached(void) { RETURNS_ZERO(crypto_sign_keypair(pk + 8, sk + 8)); randombytes(m + 8, MLEN); + + // Fill the sig buffer with random bytes + randombytes(sig_random_cmp, CRYPTO_BYTES); + memcpy(sig + 8, sig_random_cmp, CRYPTO_BYTES); + +#ifdef PQCLEAN_USE_VALGRIND + /* + * With this buffer marked as undefined, valgrind will detect + * cases where the signing code depends on the value of the tail + * of the buffer. + */ + VALGRIND_MAKE_MEM_UNDEFINED(sig + 8, CRYPTO_BYTES); +#endif + RETURNS_ZERO(crypto_sign_signature(sig + 8, &siglen, m + 8, MLEN, sk + 8)); +#ifdef PQCLEAN_USE_VALGRIND + // We have to mark the tail as defined before doing the memcmp. + VALGRIND_MAKE_MEM_DEFINED(sig + 8 + siglen, CRYPTO_BYTES - siglen); +#endif + + // check that the tail has not been modified + RETURNS_ZERO(memcmp(sig + 8 + siglen, sig_random_cmp + siglen, CRYPTO_BYTES - siglen)); + if ((returncode = crypto_sign_verify(sig + 8, siglen, m + 8, MLEN, pk + 8)) != 0) { fprintf(stderr, "ERROR Signature did not verify correctly!\n"); diff --git a/Modules/PQClean/test/duplicate_consistency/dilithium2_aarch64.yml b/Modules/PQClean/test/duplicate_consistency/dilithium2_aarch64.yml index 2d16a72a8..30907666a 100644 --- a/Modules/PQClean/test/duplicate_consistency/dilithium2_aarch64.yml +++ b/Modules/PQClean/test/duplicate_consistency/dilithium2_aarch64.yml @@ -6,8 +6,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h @@ -32,8 +30,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h diff --git a/Modules/PQClean/test/duplicate_consistency/dilithium3_aarch64.yml b/Modules/PQClean/test/duplicate_consistency/dilithium3_aarch64.yml index 09afb1ca1..4723c4ee6 100644 --- a/Modules/PQClean/test/duplicate_consistency/dilithium3_aarch64.yml +++ b/Modules/PQClean/test/duplicate_consistency/dilithium3_aarch64.yml @@ -6,8 +6,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h @@ -32,8 +30,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h diff --git a/Modules/PQClean/test/duplicate_consistency/dilithium5_aarch64.yml b/Modules/PQClean/test/duplicate_consistency/dilithium5_aarch64.yml index 486a4ebaf..29941f2e2 100644 --- a/Modules/PQClean/test/duplicate_consistency/dilithium5_aarch64.yml +++ b/Modules/PQClean/test/duplicate_consistency/dilithium5_aarch64.yml @@ -6,8 +6,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h @@ -32,8 +30,6 @@ consistency_checks: - __asm_iNTT.S - __asm_NTT.S - __asm_poly.S - - fips202x2.c - - fips202x2.h - macros_common.inc - macros.inc - NTT_params.h diff --git a/Modules/PQClean/test/duplicate_consistency/kyber1024_aarch64.yml b/Modules/PQClean/test/duplicate_consistency/kyber1024_aarch64.yml index f356ea571..09ffc25e7 100644 --- a/Modules/PQClean/test/duplicate_consistency/kyber1024_aarch64.yml +++ b/Modules/PQClean/test/duplicate_consistency/kyber1024_aarch64.yml @@ -7,8 +7,6 @@ consistency_checks: - __asm_NTT.S - __asm_poly.S - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc @@ -39,8 +37,6 @@ consistency_checks: - __asm_poly.S - cbd.c - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc diff --git a/Modules/PQClean/test/duplicate_consistency/kyber512_aarch64.yml b/Modules/PQClean/test/duplicate_consistency/kyber512_aarch64.yml index 1972d144b..731a47284 100644 --- a/Modules/PQClean/test/duplicate_consistency/kyber512_aarch64.yml +++ b/Modules/PQClean/test/duplicate_consistency/kyber512_aarch64.yml @@ -7,8 +7,6 @@ consistency_checks: - __asm_NTT.S - __asm_poly.S - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc @@ -40,8 +38,6 @@ consistency_checks: - __asm_NTT.S - __asm_poly.S - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc diff --git a/Modules/PQClean/test/duplicate_consistency/kyber768_aarch64.yml b/Modules/PQClean/test/duplicate_consistency/kyber768_aarch64.yml index 4578fc681..fb8300265 100644 --- a/Modules/PQClean/test/duplicate_consistency/kyber768_aarch64.yml +++ b/Modules/PQClean/test/duplicate_consistency/kyber768_aarch64.yml @@ -8,8 +8,6 @@ consistency_checks: - __asm_NTT.S - __asm_poly.S - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc @@ -42,8 +40,6 @@ consistency_checks: - __asm_poly.S - cbd.c - cbd.h - - fips202x2.c - - fips202x2.h - indcpa.h - kem.c - macros.inc diff --git a/Modules/PQClean/test/test_format.py b/Modules/PQClean/test/test_format.py index b07a23e89..474bfad50 100644 --- a/Modules/PQClean/test/test_format.py +++ b/Modules/PQClean/test/test_format.py @@ -1,6 +1,7 @@ import os import platform import pytest +import functools import helpers import pqclean @@ -10,18 +11,34 @@ if platform.machine() == "ppc": pytest.skip("Skipping this test on PowerPC to save cycles.", allow_module_level=True) + +__astyle_version_result = None + +def _get_astyle_version() -> str: + """Get the AStyle version number""" + # functools.lru_cache doesn't work because we want to print to stdout each time. + global __astyle_version_result + + if __astyle_version_result is None: + __astyle_version_result = helpers.run_subprocess(['astyle', '--version']) + else: + print(__astyle_version_result) + return __astyle_version_result + + helpers.ensure_available('astyle') # Check AStyle version def version_check(): - result = helpers.run_subprocess(['astyle', '--version']) + result = _get_astyle_version() if ("Artistic Style Version 3.4.8" in result or - "Artistic Style Version 3.4.9" in result): + "Artistic Style Version 3.4.9" in result or + "Artistic Style Version 3.4.10" in result): return False return "Artistic Style Version 3.4" in result if not version_check() and "CI" not in os.environ: - pytest.skip("Incompatible AStyle version (need 3.4.x) (not 3.4.{8,9})", allow_module_level=True) + pytest.skip("Incompatible AStyle version (need 3.4.x) (not 3.4.{8-10})", allow_module_level=True) @pytest.mark.parametrize( 'implementation', @@ -32,6 +49,8 @@ def version_check(): def test_format(implementation: pqclean.Implementation): cfiles = implementation.cfiles() hfiles = implementation.hfiles() + # Triggers a print + _get_astyle_version() result = helpers.run_subprocess( ['astyle', '--dry-run', diff --git a/Modules/PQClean/test/test_symbol_namespace.py b/Modules/PQClean/test/test_symbol_namespace.py index cf52833d5..7bfa889ea 100644 --- a/Modules/PQClean/test/test_symbol_namespace.py +++ b/Modules/PQClean/test/test_symbol_namespace.py @@ -48,6 +48,10 @@ def test_symbol_namespaces(implementation, impl_path, test_dir, init, destr): symbol.startswith('_KeccakF1600times4') or # MacOS symbol.startswith('KeccakP1600times4') or symbol.startswith('_KeccakP1600times4') or # MacOS + # shake2x + symbol.lstrip('_').startswith('f1600x2') or + symbol.lstrip('_').startswith('shake128x2') or + symbol.lstrip('_').startswith('shake256x2') or # weird things on i386 symbol.startswith('__x86.get_pc_thunk.')): non_namespaced.append(symbol) diff --git a/Modules/PQClean/test/test_valgrind.py b/Modules/PQClean/test/test_valgrind.py index e2d17fb97..24b59873e 100644 --- a/Modules/PQClean/test/test_valgrind.py +++ b/Modules/PQClean/test/test_valgrind.py @@ -45,7 +45,7 @@ def test_valgrind(implementation: pqclean.Implementation, impl_path, test_dir, SCHEME_DIR=os.path.abspath(impl_path), IMPLEMENTATION=implementation.name, DEST_DIR=dest_dir, - EXTRAFLAGS="-gdwarf-4", + EXTRAFLAGS="-gdwarf-4 -DPQCLEAN_USE_VALGRIND", NTESTS=1, working_dir=os.path.join(test_dir, 'test')) functest_name = './functest_{}_{}'.format(implementation.scheme.name, diff --git a/cffi_modules/_common_cffi_maker.py b/cffi_modules/_common_cffi_maker.py index dc62976b8..1c1367c37 100644 --- a/cffi_modules/_common_cffi_maker.py +++ b/cffi_modules/_common_cffi_maker.py @@ -6,19 +6,19 @@ import re import warnings -from pqc._util import partition_list, map_immed, extant_with_other_suffix, patent_warning +from pqc._util import partition_list, map_immed, extant_with_other_suffix _NAMESPACE_RE = re.compile(r'(?ms)^#define\s+(CRYPTO_NAMESPACE)\s*\(\s*(\w+)\s*\)\s+(\w+)\s*##\s*\2\s*$') def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, common_sources=frozenset(), - parent_module='pqc._lib', - patent_info=None): + parent_module='pqc._lib'): # 0. local variables # build_root = Path(build_root) makefile_parsed = parse_makefile(build_root / 'Makefile') + cflag_makefile_parsed = makefile_parsed if platform.system() != 'Windows' else parse_makefile(build_root / 'Makefile.microsoft_nmake') common_dir = build_root / '..' / '..' / '..' / 'common' _lib_name = Path(makefile_parsed['LIB']).stem lib_name = _lib_name.replace('-', '_') @@ -26,9 +26,6 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # 1. module_name # module_name = f'{parent_module}.{lib_name}' - if patent_info is not None: - patent_message = patent_warning(lib_name, patent_info) - warnings.warn(patent_message) # 2. cdefs, c_header_sources # @@ -81,21 +78,51 @@ def make_pqclean_ffi(build_root, c_header_sources, cdefs, *, # 4. included_ffis, extra_compile_args, libraries, include_dirs # included_ffis = [] + extra_compile_args = cflag_makefile_parsed['CFLAGS'].split() + include_dirs = [build_root] + libraries = [] - extra_compile_args = [] + # Modifications + + # * Move "include" flags to setuptools + _to_pop = [] + for i, arg in enumerate(extra_compile_args): + if arg.startswith('-I'): + include_dirs.append(build_root / arg[2:]) + _to_pop.extend([i]) + if arg.startswith('/I'): + include_dirs.append(build_root / extra_compile_args[i+1]) + _to_pop.extend([i, i+1]) + map_immed(extra_compile_args.pop, reversed(_to_pop)) + + # * FIXME don't make errors fatal + _to_pop = [] + for i, arg in enumerate(extra_compile_args): + if arg.startswith('-Werror'): + _to_pop.extend([i]) + if arg == '/WX': + _to_pop.extend([i]) + map_immed(extra_compile_args.pop, reversed(_to_pop)) + + # * Other Windows compiler fixes if platform.system() == 'Windows': # https://foss.heptapod.net/pypy/cffi/-/issues/516 # https://www.reddit.com/r/learnpython/comments/175js2u/def_extern_says_im_not_using_it_in_api_mode/ # https://learn.microsoft.com/en-us/cpp/build/reference/tc-tp-tc-tp-specify-source-file-type?view=msvc-170 extra_compile_args.append('/TC') - libraries = [] - if platform.system() == 'Windows': # https://stackoverflow.com/questions/69900013/link-error-cannot-build-python-c-extension-in-windows # https://learn.microsoft.com/en-us/windows/win32/seccrypto/required-libraries libraries.append('Advapi32') - include_dirs = [(build_root), (common_dir)] + # * Other Mac OS compiler fixes + if platform.system() == 'Darwin': + # https://github.com/JamesTheAwesomeDude/pypqc/issues/9 + # https://github.com/actions/runner-images/issues/1938 + extra_compile_args.extend([ + '-Wno-error=implicit-function-declaration', + '-Wno-error=macro-redefined', + ]) # 5. create, return # diff --git a/cffi_modules/_falcon_cffi_maker.py b/cffi_modules/_falcon_cffi_maker.py index ed3febd70..48719eb15 100644 --- a/cffi_modules/_falcon_cffi_maker.py +++ b/cffi_modules/_falcon_cffi_maker.py @@ -5,10 +5,4 @@ def make_falcon_ffi(build_root): common_sources = ['fips202.c', 'randombytes.c'] - patent_info = ( - 2, - ['US7308097B2'], [ - 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] - ) - - return make_sign_ffi(build_root=build_root, common_sources=common_sources, patent_info=patent_info) + return make_sign_ffi(build_root=build_root, common_sources=common_sources) diff --git a/cffi_modules/_hqc_cffi_maker.py b/cffi_modules/_hqc_cffi_maker.py index cb29883dd..93eb79519 100644 --- a/cffi_modules/_hqc_cffi_maker.py +++ b/cffi_modules/_hqc_cffi_maker.py @@ -4,10 +4,4 @@ def make_hqc_ffi(build_root): common_sources = ['fips202.c', 'randombytes.c'] - patent_info = ( - 3, [ - 'FR2956541B1/US9094189B2/EP2537284B1',], [ - 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] - ) - - return make_kem_ffi(build_root=build_root, common_sources=common_sources, patent_info=patent_info) + return make_kem_ffi(build_root=build_root, common_sources=common_sources) diff --git a/cffi_modules/_kyber_cffi_maker.py b/cffi_modules/_kyber_cffi_maker.py index 878a8289d..d4279be3f 100644 --- a/cffi_modules/_kyber_cffi_maker.py +++ b/cffi_modules/_kyber_cffi_maker.py @@ -4,18 +4,6 @@ def make_kyber_ffi(build_root): common_sources = ['fips202.c', 'randombytes.c'] - patent_info = ( - 1,[ - 'FR2956541A1/US9094189B2/EP2537284B1', - 'US9246675/EP2837128B1', - 'potential unknown others'], [ - 'https://ntruprime.cr.yp.to/faq.html', - 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/nist-pqc-license-summary-and-excerpts.pdf', - 'https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/G0DoD7lkGPk/m/d7Zw0qhGBwAJ', - 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', - 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] - ) - extra_cdefs = [dedent("""\ // Exposed internal interface void %(namespace)sindcpa_enc(uint8_t *c, const uint8_t *m, const uint8_t *pk, const uint8_t *coins); @@ -27,4 +15,4 @@ def make_kyber_ffi(build_root): #include "indcpa.h" """)] - return make_kem_ffi(build_root=build_root, extra_c_header_sources=extra_c_header_sources, extra_cdefs=extra_cdefs, common_sources=common_sources, patent_info=patent_info) + return make_kem_ffi(build_root=build_root, extra_c_header_sources=extra_c_header_sources, extra_cdefs=extra_cdefs, common_sources=common_sources) diff --git a/pqc/_util.py b/pqc/_util.py index 6c9b8e002..1360b342d 100644 --- a/pqc/_util.py +++ b/pqc/_util.py @@ -1,8 +1,5 @@ from collections import deque -from functools import partial from itertools import starmap -from pathlib import Path -import platform import re from textwrap import dedent from warnings import warn @@ -32,14 +29,12 @@ def map_immed(f, it, *, splat=False): deque((map if not splat else starmap)(f, it), 0) -def patent_warning(subject, patent_info): - severity, patents, links = patent_info - +def patent_notice(patents, subject, severity, links, stacklevel=0): if severity == 0: - return None + return if severity == 1: - return dedent(f"""\ + warn(dedent(f"""\ {subject} may be protected under patent(s) {'; '.join(patents)}. If you rely on this library via PyPI, it could break at any time if I'm forced by the patentholders to remove this module. Additionally, the patentholders might impose on you *additional* terms, beyond those stated in the software's license. @@ -47,10 +42,12 @@ def patent_warning(subject, patent_info): This is not legal advice. For more information, see: """) + '\n'.join(links) + dedent(f""" - If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it.""") + If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it."""), + stacklevel=2+stacklevel) + return if severity == 2: - return dedent(f"""\ + warn(dedent(f"""\ {subject} may be protected under patent(s) {'; '.join(patents)}. ITS LICENSING STATUS FOR PUBLIC USE IS DISPUTED OR UNKNOWN AT THIS TIME. If you rely on this library via PyPI, it could break at any time if I'm forced by the patentholders to remove this module. @@ -59,10 +56,12 @@ def patent_warning(subject, patent_info): This is not legal advice. For more information, see: """) + '\n'.join(links) + dedent(f""" - If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it.""") + If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it."""), + stacklevel=2+stacklevel) + return if severity == 3: - return dedent(f"""\ + warn(dedent(f"""\ {subject} may be protected under patent(s) {'; '.join(patents)}. IT MIGHT NOT BE LICENSED FOR PUBLIC USE AT THIS TIME. If you rely on this library via PyPI, it could break at any time if I'm forced by the patentholders to remove this module. @@ -71,6 +70,8 @@ def patent_warning(subject, patent_info): This is not legal advice. For more information, see: """) + '\n'.join(links) + dedent(f""" - If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it.""") + If the continued use of {subject} is important to you, consider hiring a lawyer and/or purchasing a license for it."""), + stacklevel=2+stacklevel) + return raise ValueError(f'severity = {severity}') diff --git a/pqc/demo.py b/pqc/demo.py index edefb3111..59bff24a7 100644 --- a/pqc/demo.py +++ b/pqc/demo.py @@ -6,5 +6,5 @@ test_decrypted = mceliece6960119.kem_dec(test_ciphertext, secret_key) if test_key != test_decrypted: - raise AssertionError("fail :(") - print("OK") + raise AssertionError('fail :(') + print('OK') diff --git a/pqc/kem/__init__.py b/pqc/kem/__init__.py deleted file mode 100644 index eb00ebe40..000000000 --- a/pqc/kem/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -__all__ = [ - 'mceliece348864', - 'mceliece460896', - 'mceliece6688128', - 'mceliece6960119', - 'mceliece8192128'] diff --git a/pqc/kem/hqc_128.py b/pqc/kem/hqc_128.py index d4ea4355e..b000e370d 100644 --- a/pqc/kem/hqc_128.py +++ b/pqc/kem/hqc_128.py @@ -1,6 +1,17 @@ from .._lib.libhqc_128_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +import os + +if os.environ.get('LICENSED_HQC', '0') == '0': + # fmt: off + from .._util import patent_notice + patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + ) + # fmt: on + +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -14,38 +25,35 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + errno = _crypto_kem_enc(_ct, _ss, _pk) - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/hqc_192.py b/pqc/kem/hqc_192.py index e3ed0c9e8..dda202f88 100644 --- a/pqc/kem/hqc_192.py +++ b/pqc/kem/hqc_192.py @@ -1,6 +1,17 @@ from .._lib.libhqc_192_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +import os + +if os.environ.get('LICENSED_HQC', '0') == '0': + # fmt: off + from .._util import patent_notice + patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + ) + # fmt: on + +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -14,38 +25,35 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + errno = _crypto_kem_enc(_ct, _ss, _pk) - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/hqc_256.py b/pqc/kem/hqc_256.py index 309ffec94..578a6965c 100644 --- a/pqc/kem/hqc_256.py +++ b/pqc/kem/hqc_256.py @@ -1,6 +1,17 @@ from .._lib.libhqc_256_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +import os + +if os.environ.get('LICENSED_HQC', '0') == '0': + # fmt: off + from .._util import patent_notice + patent_notice(['FR2956541B1/US9094189B2/EP2537284B1'], + 'the HQC cryptosystem', 3, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/round-4/final-ip-statements/HQC-Statements-Round4.pdf'] + ) + # fmt: on + +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -14,38 +25,35 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + errno = _crypto_kem_enc(_ct, _ss, _pk) - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/kyber1024.py b/pqc/kem/kyber1024.py index 7ecc10e12..e37a8d59f 100644 --- a/pqc/kem/kyber1024.py +++ b/pqc/kem/kyber1024.py @@ -1,6 +1,21 @@ from .._lib.libkyber1024_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +import os + +if os.environ.get('LICENSED_KYBER', '0') == '0': + # fmt: off + from .._util import patent_notice + patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], + 'the Kyber cryptosystem', 1, [ + 'https://ntruprime.cr.yp.to/faq.html', + 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/nist-pqc-license-summary-and-excerpts.pdf', + 'https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/G0DoD7lkGPk/m/d7Zw0qhGBwAJ', + 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', + 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] + ) + # fmt: on + +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -12,43 +27,37 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_indcpa_enc = getattr(lib, f'{_LIB_NAMESPACE}indcpa_enc') -_indcpa_dec = getattr(lib, f'{_LIB_NAMESPACE}indcpa_dec') - def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + errno = _crypto_kem_enc(_ct, _ss, _pk) - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/kyber512.py b/pqc/kem/kyber512.py index 945e1049a..8103ae69f 100644 --- a/pqc/kem/kyber512.py +++ b/pqc/kem/kyber512.py @@ -1,6 +1,21 @@ from .._lib.libkyber512_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +import os + +if os.environ.get('LICENSED_KYBER', '0') == '0': + # fmt: off + from .._util import patent_notice + patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], + 'the Kyber cryptosystem', 1, [ + 'https://ntruprime.cr.yp.to/faq.html', + 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/nist-pqc-license-summary-and-excerpts.pdf', + 'https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/G0DoD7lkGPk/m/d7Zw0qhGBwAJ', + 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', + 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] + ) + # fmt: on + +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -12,43 +27,37 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_indcpa_enc = getattr(lib, f'{_LIB_NAMESPACE}indcpa_enc') -_indcpa_dec = getattr(lib, f'{_LIB_NAMESPACE}indcpa_dec') - def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + errno = _crypto_kem_enc(_ct, _ss, _pk) - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/kyber768.py b/pqc/kem/kyber768.py index 01282a541..50da301bb 100644 --- a/pqc/kem/kyber768.py +++ b/pqc/kem/kyber768.py @@ -1,6 +1,21 @@ from .._lib.libkyber768_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +import os + +if os.environ.get('LICENSED_KYBER', '0') == '0': + # fmt: off + from .._util import patent_notice + patent_notice(['FR2956541A1/US9094189B2/EP2537284B1', 'US9246675/EP2837128B1', 'potential unknown others'], + 'the Kyber cryptosystem', 1, [ + 'https://ntruprime.cr.yp.to/faq.html', + 'https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/nist-pqc-license-summary-and-excerpts.pdf', + 'https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/G0DoD7lkGPk/m/d7Zw0qhGBwAJ', + 'https://datatracker.ietf.org/meeting/116/proceedings#pquip:~:text=Patents%20and%20PQC', + 'https://mailarchive.ietf.org/arch/msg/pqc/MS92cuZkSRCDEjpPP90s2uAcRPo/'] + ) + # fmt: on + +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -12,43 +27,37 @@ _crypto_kem_enc = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_enc') _crypto_kem_dec = getattr(lib, f'{_LIB_NAMESPACE}crypto_kem_dec') -_indcpa_enc = getattr(lib, f'{_LIB_NAMESPACE}indcpa_enc') -_indcpa_dec = getattr(lib, f'{_LIB_NAMESPACE}indcpa_dec') - def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") + errno = _crypto_kem_enc(_ct, _ss, _pk) - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece348864.py b/pqc/kem/mceliece348864.py index bb92803a0..fccf2e0f5 100644 --- a/pqc/kem/mceliece348864.py +++ b/pqc/kem/mceliece348864.py @@ -1,6 +1,6 @@ from .._lib.libmceliece348864f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,38 +18,35 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece460896.py b/pqc/kem/mceliece460896.py index 0fdded864..2c6886176 100644 --- a/pqc/kem/mceliece460896.py +++ b/pqc/kem/mceliece460896.py @@ -1,6 +1,6 @@ from .._lib.libmceliece460896f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,38 +18,35 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece6688128.py b/pqc/kem/mceliece6688128.py index 30e1d1826..261e6c9c8 100644 --- a/pqc/kem/mceliece6688128.py +++ b/pqc/kem/mceliece6688128.py @@ -1,6 +1,6 @@ -from .._lib.libmceliece66881128f_clean import ffi, lib +from .._lib.libmceliece6688128f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,38 +18,35 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece6960119.py b/pqc/kem/mceliece6960119.py index 4a75143fb..62bcac439 100644 --- a/pqc/kem/mceliece6960119.py +++ b/pqc/kem/mceliece6960119.py @@ -1,6 +1,6 @@ from .._lib.libmceliece6960119f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,38 +18,35 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/kem/mceliece8192128.py b/pqc/kem/mceliece8192128.py index c5ee13607..edf078dfb 100644 --- a/pqc/kem/mceliece8192128.py +++ b/pqc/kem/mceliece8192128.py @@ -1,6 +1,6 @@ from .._lib.libmceliece8192128f_clean import ffi, lib -__all__ = ['kem_keypair', 'kem_enc', 'kem_dec'] +__all__ = ['keypair', 'encap', 'decap'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,38 +18,35 @@ def keypair(): - pk = ffi.new(_T_PUBLICKEY) - sk = ffi.new(_T_SECRETKEY) + _pk = ffi.new(_T_PUBLICKEY) + _sk = ffi.new(_T_SECRETKEY) - errno = _crypto_kem_keypair(ffi.cast('char*', pk), ffi.cast('char*', sk)) + errno = _crypto_kem_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_keypair returned error code {errno}") - return bytes(pk), bytes(sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_kem_keypair.__name__} returned error code {errno}') def encap(pk): - ciphertext = ffi.new(_T_KEM_CIPHERTEXT) - key = ffi.new(_T_KEM_PLAINTEXT) - pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) + _ct = ffi.new(_T_KEM_CIPHERTEXT) + _ss = ffi.new(_T_KEM_PLAINTEXT) + _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) - errno = _crypto_kem_enc(ffi.cast('char*', ciphertext), ffi.cast('char*', key), ffi.cast('char*', pk)) + errno = _crypto_kem_enc(_ct, _ss, _pk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_enc returned error code {errno}") - - return bytes(key), bytes(ciphertext) + if errno == 0: + return bytes(_ss), bytes(_ct) + raise RuntimeError(f'{_crypto_kem_enc.__name__} returned error code {errno}') def decap(ciphertext, sk): - key = ffi.new(_T_KEM_PLAINTEXT) - ciphertext = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) - sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - - errno = _crypto_kem_dec(ffi.cast('char*', key), ffi.cast('char*', ciphertext), ffi.cast('char*', sk)) - - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_kem_dec returned error code {errno}") + _ss = ffi.new(_T_KEM_PLAINTEXT) + _ct = ffi.cast(_T_KEM_CIPHERTEXT, ffi.from_buffer(ciphertext)) + _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - return bytes(key) + errno = _crypto_kem_dec(_ss, _ct, _sk) + if errno == 0: + return bytes(_ss) + raise RuntimeError(f'{_crypto_kem_dec.__name__} returned error code {errno}') diff --git a/pqc/sign/dilithium2.py b/pqc/sign/dilithium2.py index 7193d8aeb..b3a434332 100644 --- a/pqc/sign/dilithium2.py +++ b/pqc/sign/dilithium2.py @@ -1,6 +1,6 @@ from .._lib.libdilithium2_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/dilithium3.py b/pqc/sign/dilithium3.py index a47f2fefc..1ee2044ae 100644 --- a/pqc/sign/dilithium3.py +++ b/pqc/sign/dilithium3.py @@ -1,6 +1,6 @@ from .._lib.libdilithium3_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/dilithium5.py b/pqc/sign/dilithium5.py index d8f1d321a..c90a69483 100644 --- a/pqc/sign/dilithium5.py +++ b/pqc/sign/dilithium5.py @@ -1,6 +1,6 @@ from .._lib.libdilithium5_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/falcon_1024.py b/pqc/sign/falcon_1024.py index 81168bf87..c98ef67ac 100644 --- a/pqc/sign/falcon_1024.py +++ b/pqc/sign/falcon_1024.py @@ -1,6 +1,17 @@ from .._lib.libfalcon_1024_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +import os + +if os.environ.get('LICENSED_FALCON', '0') == '0': + # fmt: off + from .._util import patent_notice + patent_notice(['US7308097B2'], + 'the Falcon cryptosystem', 2, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] + ) + # fmt: on + +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -10,7 +21,6 @@ _crypto_sign_keypair = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_keypair') _crypto_sign_signature = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_signature') _crypto_sign_verify = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_verify') -_SIGNATURE_MAXLEN = getattr(lib, f'{_LIB_NAMESPACE}CRYPTO_BYTES') def keypair(): @@ -19,43 +29,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sigbuf = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sigbuf, _siglen, _m, len(m), _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - _sig = _sigbuf[0:_siglen[0]] # Non-copying slice operation - return bytes(_sig) + if errno == 0: + _sig = _sigbuf[0 : _siglen[0]] # Variable-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.from_buffer(sig) - assert len(_sig) <= _SIGNATURE_MAXLEN - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/falcon_512.py b/pqc/sign/falcon_512.py index 485575952..c658e9a6f 100644 --- a/pqc/sign/falcon_512.py +++ b/pqc/sign/falcon_512.py @@ -1,6 +1,17 @@ from .._lib.libfalcon_512_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +import os + +if os.environ.get('LICENSED_FALCON', '0') == '0': + # fmt: off + from .._util import patent_notice + patent_notice(['US7308097B2'], + 'the Falcon cryptosystem', 2, + ['https://csrc.nist.gov/csrc/media/Projects/post-quantum-cryptography/documents/selected-algos-2022/final-ip-statements/Falcon-Statements-final.pdf#page=20'] + ) + # fmt: on + +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -10,7 +21,6 @@ _crypto_sign_keypair = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_keypair') _crypto_sign_signature = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_signature') _crypto_sign_verify = getattr(lib, f'{_LIB_NAMESPACE}crypto_sign_verify') -_SIGNATURE_MAXLEN = getattr(lib, f'{_LIB_NAMESPACE}CRYPTO_BYTES') def keypair(): @@ -19,43 +29,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sigbuf = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sigbuf, _siglen, _m, len(m), _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - _sig = _sigbuf[0:_siglen[0]] # Non-copying slice operation - return bytes(_sig) + if errno == 0: + _sig = _sigbuf[0 : _siglen[0]] # Variable-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.from_buffer(sig) - assert len(_sig) <= _SIGNATURE_MAXLEN - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_128f_simple.py b/pqc/sign/sphincs_sha2_128f_simple.py index 910ad9024..0a1c4b0c3 100644 --- a/pqc/sign/sphincs_sha2_128f_simple.py +++ b/pqc/sign/sphincs_sha2_128f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_128f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_128s_simple.py b/pqc/sign/sphincs_sha2_128s_simple.py index 7407f7999..b1b36ae57 100644 --- a/pqc/sign/sphincs_sha2_128s_simple.py +++ b/pqc/sign/sphincs_sha2_128s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_128s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_192f_simple.py b/pqc/sign/sphincs_sha2_192f_simple.py index 4fd403e05..0ff057d30 100644 --- a/pqc/sign/sphincs_sha2_192f_simple.py +++ b/pqc/sign/sphincs_sha2_192f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_192f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_192s_simple.py b/pqc/sign/sphincs_sha2_192s_simple.py index d51d23c42..8040aa556 100644 --- a/pqc/sign/sphincs_sha2_192s_simple.py +++ b/pqc/sign/sphincs_sha2_192s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_192s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_256f_simple.py b/pqc/sign/sphincs_sha2_256f_simple.py index d09e4b272..69aeb17ed 100644 --- a/pqc/sign/sphincs_sha2_256f_simple.py +++ b/pqc/sign/sphincs_sha2_256f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_256f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_sha2_256s_simple.py b/pqc/sign/sphincs_sha2_256s_simple.py index 681d45246..b4ce53bb9 100644 --- a/pqc/sign/sphincs_sha2_256s_simple.py +++ b/pqc/sign/sphincs_sha2_256s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_sha2_256s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_128f_simple.py b/pqc/sign/sphincs_shake_128f_simple.py index d689d4348..da90937d1 100644 --- a/pqc/sign/sphincs_shake_128f_simple.py +++ b/pqc/sign/sphincs_shake_128f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_128f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_128s_simple.py b/pqc/sign/sphincs_shake_128s_simple.py index 6f67854e9..8d2b726d0 100644 --- a/pqc/sign/sphincs_shake_128s_simple.py +++ b/pqc/sign/sphincs_shake_128s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_128s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_192f_simple.py b/pqc/sign/sphincs_shake_192f_simple.py index b8284828d..39287aff1 100644 --- a/pqc/sign/sphincs_shake_192f_simple.py +++ b/pqc/sign/sphincs_shake_192f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_192f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_192s_simple.py b/pqc/sign/sphincs_shake_192s_simple.py index a381f1123..6c70b8c34 100644 --- a/pqc/sign/sphincs_shake_192s_simple.py +++ b/pqc/sign/sphincs_shake_192s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_192s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_256f_simple.py b/pqc/sign/sphincs_shake_256f_simple.py index 72b7a2f14..368e5c00f 100644 --- a/pqc/sign/sphincs_shake_256f_simple.py +++ b/pqc/sign/sphincs_shake_256f_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_256f_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pqc/sign/sphincs_shake_256s_simple.py b/pqc/sign/sphincs_shake_256s_simple.py index 3e12ebfef..16c313369 100644 --- a/pqc/sign/sphincs_shake_256s_simple.py +++ b/pqc/sign/sphincs_shake_256s_simple.py @@ -1,6 +1,6 @@ from .._lib.libsphincs_shake_256s_simple_clean import ffi, lib -__all__ = ['keypair', 'signature', 'verify'] +__all__ = ['keypair', 'sign', 'verify'] _LIB_NAMESPACE = ffi.string(lib._NAMESPACE).decode('ascii') _T_PUBLICKEY = f'{_LIB_NAMESPACE}crypto_publickey' @@ -18,42 +18,34 @@ def keypair(): errno = _crypto_sign_keypair(_pk, _sk) - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_keypair returned error code {errno}") - return bytes(_pk), bytes(_sk) + if errno == 0: + return bytes(_pk), bytes(_sk) + raise RuntimeError(f'{_crypto_sign_keypair.__name__} returned error code {errno}') -def signature(m, sk): +def sign(m, sk): _m = ffi.from_buffer(m) - _sk = ffi.cast(_T_SECRETKEY, ffi.from_buffer(sk)) - assert len(_sk) == len(sk) - _sig = ffi.new(_T_SIGNATURE) _siglen = ffi.new('size_t*') errno = _crypto_sign_signature(_sig, _siglen, _m, len(m), _sk) - assert len(_sig) == _siglen[0] # This is a fixed parameter; WHY is it output?? - if errno: - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_signature returned error code {errno}") - - return bytes(_sig) + if errno == 0: + assert len(_sig) == _siglen[0] # Fixed-length signature + return bytes(_sig) + raise RuntimeError(f'{_crypto_sign_signature.__name__} returned error code {errno}') def verify(sig, m, pk): _sig = ffi.cast(_T_SIGNATURE, ffi.from_buffer(sig)) - _m = ffi.from_buffer(m) - _pk = ffi.cast(_T_PUBLICKEY, ffi.from_buffer(pk)) errno = _crypto_sign_verify(_sig, len(_sig), _m, len(_m), _pk) - if errno: - if errno == -1: - raise ValueError('verification failed') - raise RuntimeError(f"{_LIB_NAMESPACE}crypto_sign_verify returned error code {errno}") - - return - + if errno == 0: + return + if errno == -1: + raise ValueError('verification failed') + raise RuntimeError(f'{_crypto_sign_verify.__name__} returned error code {errno}') diff --git a/pyproject.toml b/pyproject.toml index 0672548b3..6b9df9512 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [project] name = "pypqc" -version = "0.0.6.1.post1" +version = "0.0.6.2" description = "Python bindings for the \"PQClean\" post-quantum cryptography library." readme = "README.rst" urls = {"Homepage" = "https://github.com/JamesTheAwesomeDude/pypqc"} dependencies = [ - 'cffi >= 1.0.0', + 'cffi >= 1.0.0;platform_python_implementation != "PyPy"', ] [tool.setuptools.packages.find] @@ -16,5 +16,9 @@ namespaces = false requires = [ 'cffi >= 1.14.5', 'setuptools >= 49.5.0', - 'wheel >= 0.38.0', + 'wheel >= 0.30.0', ] + +[tool.ruff.format] +quote-style = "single" +indent-style = "tab" diff --git a/requirements-dev.txt b/requirements-dev.txt index 14f55940f..d6f2b6998 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,6 @@ -pip >= 18.0 -cffi >= 1.15.0 +cffi >= 1.14.5 setuptools >= 49.5.0 +wheel >= 0.30.0 +# Beyond "build" requirements +build >= 0.6.0 +twine >= 1.15.0 diff --git a/setup.py b/setup.py index 3035526b6..fbc8b8d9d 100644 --- a/setup.py +++ b/setup.py @@ -1,55 +1,88 @@ # https://foss.heptapod.net/pypy/cffi/-/issues/441 # https://github.com/pypa/setuptools/issues/1040 +import platform from setuptools import setup -from distutils.command.build_ext import build_ext as _build_ext +import sys from wheel.bdist_wheel import bdist_wheel as _bdist_wheel +# Pending https://hpyproject.org/ +ABI3_EXCLUDE_IMPLEMENTATIONS = { + 'PyPy', # https://github.com/orgs/pypy/discussions/4884#discussioncomment-8309845 +} -class bdist_wheel_abi_none(_bdist_wheel): - """https://github.com/joerick/python-ctypes-package-sample/blob/7db688cd6ee32ae95bce0f75fb7d806926e20252/setup.py#L29""" - def finalize_options(self): - _bdist_wheel.finalize_options(self) - self.root_is_pure = False - def get_tag(self): - python, abi, plat = _bdist_wheel.get_tag(self) - return "py3", "none", plat +class site_bdist_wheel(_bdist_wheel): + """https://github.com/joerick/python-ctypes-package-sample/blob/7db688cd6ee32ae95bce0f75fb7d806926e20252/setup.py#L29""" + + def finalize_options(self): + # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L244 + if ( + platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS + # https://github.com/pypa/wheel/blob/0.42.0/src/wheel/bdist_wheel.py#L267 + and ( + self.distribution.has_ext_modules() + or self.distribution.has_c_libraries() + ) + # https://github.com/pypa/setuptools/blob/v69.0.3/setuptools/command/build_ext.py#L160 + and all(ext.py_limited_api for ext in self.distribution.ext_modules) + ): + self.py_limited_api = ( + f'cp{sys.version_info.major}{sys.version_info.minor}' + if platform.python_implementation() == 'CPython' + else f'py{sys.version_info.major}{sys.version_info.minor}' + ) + super().finalize_options() + + def get_tag(self): + python, abi, plat = _bdist_wheel.get_tag(self) + if ( + self.py_limited_api + and platform.python_implementation() not in ABI3_EXCLUDE_IMPLEMENTATIONS + ): + # https://github.com/pypa/cibuildwheel/blob/v2.16.3/cibuildwheel/util.py#L653 + python = ( + f'cp{sys.version_info.major}{sys.version_info.minor}' + if platform.python_implementation() == 'CPython' + else f'py{sys.version_info.major}{sys.version_info.minor}' + ) + abi = f'abi{sys.version_info.major}' + return python, abi, plat setup( - cmdclass={"bdist_wheel": bdist_wheel_abi_none}, - cffi_modules=[ - 'cffi_modules/dilithium2_clean.py:ffi', - 'cffi_modules/dilithium3_clean.py:ffi', - 'cffi_modules/dilithium5_clean.py:ffi', - 'cffi_modules/falcon_512_clean.py:ffi', - 'cffi_modules/falcon_1024_clean.py:ffi', - 'cffi_modules/hqc_128_clean.py:ffi', - 'cffi_modules/hqc_192_clean.py:ffi', - 'cffi_modules/hqc_256_clean.py:ffi', - 'cffi_modules/kyber512_clean.py:ffi', - 'cffi_modules/kyber768_clean.py:ffi', - 'cffi_modules/kyber1024_clean.py:ffi', - 'cffi_modules/mceliece348864f_clean.py:ffi', - 'cffi_modules/mceliece460896f_clean.py:ffi', - 'cffi_modules/mceliece6688128f_clean.py:ffi', - 'cffi_modules/mceliece6960119f_clean.py:ffi', - 'cffi_modules/mceliece8192128f_clean.py:ffi', -# 'cffi_modules/mceliece6688128pcf_clean.py:ffi', -# 'cffi_modules/mceliece6960119pcf_clean.py:ffi', -# 'cffi_modules/mceliece8192128pcf_clean.py:ffi', - 'cffi_modules/sphincs-sha2-128f-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-128s-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-192f-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-192s-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-256f-simple_clean.py:ffi', - 'cffi_modules/sphincs-sha2-256s-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-128f-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-128s-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-192f-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-192s-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-256f-simple_clean.py:ffi', - 'cffi_modules/sphincs-shake-256s-simple_clean.py:ffi', - ], + cmdclass={'bdist_wheel': site_bdist_wheel}, + cffi_modules=[ + 'cffi_modules/dilithium2_clean.py:ffi', + 'cffi_modules/dilithium3_clean.py:ffi', + 'cffi_modules/dilithium5_clean.py:ffi', + 'cffi_modules/falcon_512_clean.py:ffi', + 'cffi_modules/falcon_1024_clean.py:ffi', + 'cffi_modules/hqc_128_clean.py:ffi', + 'cffi_modules/hqc_192_clean.py:ffi', + 'cffi_modules/hqc_256_clean.py:ffi', + 'cffi_modules/kyber512_clean.py:ffi', + 'cffi_modules/kyber768_clean.py:ffi', + 'cffi_modules/kyber1024_clean.py:ffi', + 'cffi_modules/mceliece348864f_clean.py:ffi', + 'cffi_modules/mceliece460896f_clean.py:ffi', + 'cffi_modules/mceliece6688128f_clean.py:ffi', + 'cffi_modules/mceliece6960119f_clean.py:ffi', + 'cffi_modules/mceliece8192128f_clean.py:ffi', + ##'cffi_modules/mceliece6688128pcf_clean.py:ffi', + ##'cffi_modules/mceliece6960119pcf_clean.py:ffi', + ##'cffi_modules/mceliece8192128pcf_clean.py:ffi', + 'cffi_modules/sphincs-sha2-128f-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-128s-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-192f-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-192s-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-256f-simple_clean.py:ffi', + 'cffi_modules/sphincs-sha2-256s-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-128f-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-128s-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-192f-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-192s-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-256f-simple_clean.py:ffi', + 'cffi_modules/sphincs-shake-256s-simple_clean.py:ffi', + ], )