From 2e42595804242d1e24e04f0f770a348c0c22313b Mon Sep 17 00:00:00 2001 From: "Jeevesh Rishi Dindyal (Sarvesh)" <68096721+ryndia@users.noreply.github.com> Date: Thu, 22 Dec 2022 10:37:44 +0400 Subject: [PATCH] Add ntruprime (#1328) * Add back sntrup761 --- .CMake/alg_support.cmake | 8 + CMakeLists.txt | 3 + CONTRIBUTORS | 1 + README.md | 2 +- docs/algorithms/kem/ntruprime.yml | 57 + .../copy_from_upstream/copy_from_upstream.yml | 9 + src/CMakeLists.txt | 4 + src/kem/kem.c | 13 + src/kem/kem.h | 7 +- src/kem/ntruprime/CMakeLists.txt | 29 + src/kem/ntruprime/kem_ntruprime.h | 20 + src/kem/ntruprime/kem_ntruprime_sntrup761.c | 91 + .../ntruprime/pqclean_sntrup761_avx2/LICENSE | 1 + .../ntruprime/pqclean_sntrup761_avx2/api.h | 17 + .../crypto_core_inv3sntrup761.c | 542 ++++ .../crypto_core_inv3sntrup761.h | 11 + .../crypto_core_invsntrup761.c | 217 ++ .../crypto_core_invsntrup761.h | 11 + .../crypto_core_mult3sntrup761.c | 258 ++ .../crypto_core_mult3sntrup761.h | 11 + .../crypto_core_multsntrup761.c | 313 +++ .../crypto_core_multsntrup761.h | 11 + .../crypto_core_multsntrup761_ntt.c | 2175 +++++++++++++++++ .../crypto_core_multsntrup761_ntt.h | 13 + .../crypto_core_scale3sntrup761.c | 47 + .../crypto_core_scale3sntrup761.h | 11 + .../crypto_core_weightsntrup761.c | 45 + .../crypto_core_weightsntrup761.h | 11 + .../crypto_core_wforcesntrup761.c | 61 + .../crypto_core_wforcesntrup761.h | 11 + .../crypto_declassify.h | 8 + .../crypto_decode_761x1531.c | 432 ++++ .../crypto_decode_761x1531.h | 10 + .../crypto_decode_761x3.c | 65 + .../crypto_decode_761x3.h | 10 + .../crypto_decode_761x4591.c | 432 ++++ .../crypto_decode_761x4591.h | 10 + .../crypto_decode_761xint16.c | 15 + .../crypto_decode_761xint16.h | 10 + .../crypto_decode_761xint32.c | 20 + .../crypto_decode_761xint32.h | 10 + .../crypto_decode_int16.c | 9 + .../crypto_decode_int16.h | 9 + .../crypto_encode_761x1531.c | 301 +++ .../crypto_encode_761x1531.h | 10 + .../crypto_encode_761x1531round.c | 303 +++ .../crypto_encode_761x1531round.h | 10 + .../crypto_encode_761x3.c | 64 + .../crypto_encode_761x3.h | 10 + .../crypto_encode_761x4591.c | 308 +++ .../crypto_encode_761x4591.h | 10 + .../crypto_encode_761xfreeze3.c | 31 + .../crypto_encode_761xfreeze3.h | 10 + .../crypto_encode_761xint16.c | 13 + .../crypto_encode_761xint16.h | 10 + .../crypto_encode_int16.c | 9 + .../crypto_encode_int16.h | 10 + .../crypto_sort_int32.c | 1215 +++++++++ .../crypto_sort_int32.h | 8 + .../crypto_sort_uint32.c | 18 + .../crypto_sort_uint32.h | 8 + .../crypto_verify_1039.c | 36 + .../crypto_verify_1039.h | 8 + .../ntruprime/pqclean_sntrup761_avx2/kem.c | 251 ++ .../ntruprime/pqclean_sntrup761_avx2/params.h | 76 + .../ntruprime/pqclean_sntrup761_clean/LICENSE | 1 + .../ntruprime/pqclean_sntrup761_clean/api.h | 17 + .../crypto_core_inv3sntrup761.c | 110 + .../crypto_core_inv3sntrup761.h | 11 + .../crypto_core_invsntrup761.c | 132 + .../crypto_core_invsntrup761.h | 11 + .../crypto_core_mult3sntrup761.c | 57 + .../crypto_core_mult3sntrup761.h | 11 + .../crypto_core_multsntrup761.c | 60 + .../crypto_core_multsntrup761.h | 11 + .../crypto_core_scale3sntrup761.c | 32 + .../crypto_core_scale3sntrup761.h | 11 + .../crypto_core_weightsntrup761.c | 21 + .../crypto_core_weightsntrup761.h | 11 + .../crypto_core_wforcesntrup761.c | 48 + .../crypto_core_wforcesntrup761.h | 11 + .../crypto_declassify.h | 8 + .../crypto_decode_761x1531.c | 211 ++ .../crypto_decode_761x1531.h | 10 + .../crypto_decode_761x3.c | 24 + .../crypto_decode_761x3.h | 10 + .../crypto_decode_761x4591.c | 
211 ++ .../crypto_decode_761x4591.h | 10 + .../crypto_decode_761xint16.c | 15 + .../crypto_decode_761xint16.h | 10 + .../crypto_decode_761xint32.c | 20 + .../crypto_decode_761xint32.h | 10 + .../crypto_encode_761x1531.c | 119 + .../crypto_encode_761x1531.h | 10 + .../crypto_encode_761x1531round.c | 17 + .../crypto_encode_761x1531round.h | 10 + .../crypto_encode_761x3.c | 21 + .../crypto_encode_761x3.h | 10 + .../crypto_encode_761x4591.c | 147 ++ .../crypto_encode_761x4591.h | 10 + .../crypto_encode_761xfreeze3.c | 25 + .../crypto_encode_761xfreeze3.h | 10 + .../crypto_encode_761xint16.c | 13 + .../crypto_encode_761xint16.h | 10 + .../crypto_encode_int16.c | 9 + .../crypto_encode_int16.h | 10 + .../crypto_sort_int32.c | 84 + .../crypto_sort_int32.h | 8 + .../crypto_sort_uint32.c | 18 + .../crypto_sort_uint32.h | 8 + .../crypto_verify_1039.c | 13 + .../crypto_verify_1039.h | 8 + .../ntruprime/pqclean_sntrup761_clean/kem.c | 251 ++ .../pqclean_sntrup761_clean/params.h | 72 + src/oqsconfig.h.cmake | 4 + tests/KATs/kem/kats.json | 3 +- tests/constant_time/kem/issues.json | 3 +- tests/constant_time/kem/passes.json | 3 +- tests/constant_time/kem/passes/sntrup | 7 + 119 files changed, 9761 insertions(+), 5 deletions(-) create mode 100644 docs/algorithms/kem/ntruprime.yml create mode 100644 src/kem/ntruprime/CMakeLists.txt create mode 100644 src/kem/ntruprime/kem_ntruprime.h create mode 100644 src/kem/ntruprime/kem_ntruprime_sntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/LICENSE create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/api.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_invsntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_invsntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_weightsntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_weightsntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_declassify.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x1531.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x1531.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x3.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x3.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x4591.c create mode 100644 
src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x4591.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint16.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint16.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint32.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint32.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_int16.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_int16.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531round.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531round.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x3.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x3.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x4591.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x4591.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xint16.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xint16.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_int16.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_int16.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_int32.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_int32.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_uint32.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_uint32.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_verify_1039.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_verify_1039.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/kem.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_avx2/params.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/LICENSE create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/api.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_inv3sntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_inv3sntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_invsntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_invsntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_mult3sntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_mult3sntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_multsntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_multsntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_scale3sntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_scale3sntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_weightsntrup761.c create mode 100644 
src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_weightsntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_wforcesntrup761.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_wforcesntrup761.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_declassify.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x1531.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x1531.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x3.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x3.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x4591.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x4591.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint16.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint16.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint32.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint32.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531round.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531round.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x3.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x3.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x4591.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x4591.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xfreeze3.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xfreeze3.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xint16.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xint16.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_int16.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_int16.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_int32.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_int32.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_uint32.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_uint32.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_verify_1039.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/crypto_verify_1039.h create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/kem.c create mode 100644 src/kem/ntruprime/pqclean_sntrup761_clean/params.h create mode 100644 tests/constant_time/kem/passes/sntrup diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index 56390b4800..dee130181e 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -216,6 +216,14 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AES_INSTRUCTIONS AND OQS_USE_AVX2_INSTRUCTI endif() endif() +option(OQS_ENABLE_KEM_NTRUPRIME "Enable ntruprime algorithm family" ON) + +cmake_dependent_option(OQS_ENABLE_KEM_ntruprime_sntrup761 "" ON "OQS_ENABLE_KEM_NTRUPRIME" OFF) 
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) + cmake_dependent_option(OQS_ENABLE_KEM_ntruprime_sntrup761_avx2 "" ON "OQS_ENABLE_KEM_ntruprime_sntrup761" OFF) +endif() +endif() option(OQS_ENABLE_SIG_DILITHIUM "Enable dilithium algorithm family" ON) cmake_dependent_option(OQS_ENABLE_SIG_dilithium_2 "" ON "OQS_ENABLE_SIG_DILITHIUM" OFF) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9609c47cff..52481ed11c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -160,6 +160,9 @@ endif() if(OQS_ENABLE_SIG_DILITHIUM) set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig/dilithium/sig_dilithium.h) endif() +if(OQS_ENABLE_KEM_NTRUPRIME) + set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/kem/ntruprime/kem_ntruprime.h) +endif() if(OQS_ENABLE_SIG_FALCON) set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig/falcon/sig_falcon.h) endif() diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 79792aa652..0a370ac13d 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -33,5 +33,6 @@ John Underhill Karolin Varner Sebastian Verschoor (University of Waterloo) Thom Wiggers (Radboud University) +Dindyal Jeevesh Rishi (University of Mauritius / cyberstorm.mu) See additional contributors at https://github.com/open-quantum-safe/liboqs/graphs/contributors diff --git a/README.md b/README.md index 448e0d50cf..9c8293bc34 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Note that for algorithms marked with a dagger (†), liboqs contains at least on ### Limitations and Security -While at the time of this writing there are no vulnerabilities known in any of the quantum-safe algorithms used in this library, caution is advised when deploying quantum-safe algorithms as most of the algorithms and software have not been subject to the same degree of scrutiny as for currently deployed algorithms. Particular attention should be paid to guidance provided by the standards community, especially from the NIST [Post-Quantum Cryptography Standardization](https://csrc.nist.gov/Projects/Post-Quantum-Cryptography/Post-Quantum-Cryptography-Standardization) project. As research advances, the supported algorithms may see rapid changes in their security, and may even prove insecure against both classical and quantum computers. +While at the time of this writing there are no vulnerabilities known in any of the quantum-safe algorithms used in this library, caution is advised when deploying quantum-safe algorithms as most of the algorithms and software have not been subject to the same degree of scrutiny as for currently deployed algorithms. Particular attention should be paid to guidance provided by the standards community, especially from the NIST [Post-Quantum Cryptography Standardization](https://csrc.nist.gov/Projects/Post-Quantum-Cryptography/Post-Quantum-Cryptography-Standardization) project. As research advances, the supported algorithms may see rapid changes in their security, and may even prove insecure against both classical and quantum computers. Moreover, note that `sntrup761` is only included for interoperability testing. liboqs does not intend to "pick winners": algorithm support is informed by the NIST PQC standardization project. We strongly recommend that applications and protocols rely on the outcomes of this effort when deploying post-quantum cryptography.
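For context, the sketch below shows how an application could exercise the newly registered KEM through the generic liboqs API once this patch is applied and the library is built with OQS_ENABLE_KEM_ntruprime_sntrup761. It is a minimal illustration, not part of the patch: it uses only the existing public KEM API (OQS_KEM_new, OQS_KEM_keypair, OQS_KEM_encaps, OQS_KEM_decaps, OQS_KEM_free) plus the "sntrup761" identifier added above; error handling is abbreviated and a real application should wipe the secret key with OQS_MEM_secure_free.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <oqs/oqs.h>

int main(void) {
    /* Skip gracefully if this build does not include sntrup761. */
    if (!OQS_KEM_alg_is_enabled(OQS_KEM_alg_ntruprime_sntrup761)) {
        fprintf(stderr, "sntrup761 is not enabled in this liboqs build\n");
        return EXIT_FAILURE;
    }
    OQS_KEM *kem = OQS_KEM_new(OQS_KEM_alg_ntruprime_sntrup761);
    if (kem == NULL) {
        return EXIT_FAILURE;
    }
    uint8_t *pk = malloc(kem->length_public_key);
    uint8_t *sk = malloc(kem->length_secret_key);   /* wipe with OQS_MEM_secure_free in real code */
    uint8_t *ct = malloc(kem->length_ciphertext);
    uint8_t *ss_enc = malloc(kem->length_shared_secret);
    uint8_t *ss_dec = malloc(kem->length_shared_secret);
    int rc = EXIT_FAILURE;
    /* Key generation, encapsulation, decapsulation round trip. */
    if (pk && sk && ct && ss_enc && ss_dec &&
            OQS_KEM_keypair(kem, pk, sk) == OQS_SUCCESS &&
            OQS_KEM_encaps(kem, ct, ss_enc, pk) == OQS_SUCCESS &&
            OQS_KEM_decaps(kem, ss_dec, ct, sk) == OQS_SUCCESS &&
            memcmp(ss_enc, ss_dec, kem->length_shared_secret) == 0) {
        printf("sntrup761 round trip OK (%zu-byte shared secret)\n", kem->length_shared_secret);
        rc = EXIT_SUCCESS;
    }
    free(pk); free(sk); free(ct); free(ss_enc); free(ss_dec);
    OQS_KEM_free(kem);
    return rc;
}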
diff --git a/docs/algorithms/kem/ntruprime.yml b/docs/algorithms/kem/ntruprime.yml new file mode 100644 index 0000000000..d78ad5edc7 --- /dev/null +++ b/docs/algorithms/kem/ntruprime.yml @@ -0,0 +1,57 @@ +name: NTRU-Prime +type: kem +principal-submitters: +- Daniel J. Bernstein +- Billy Bob Brumley +- Ming-Shing Chen +- Chitchanok Chuengsatiansup +- Tanja Lange +- Adrian Marotzke +- Bo-Yuan Peng +- Nicola Tuveri +- Christine van Vredendaal +- Bo-Yin Yang +crypto-assumption: NTRU +website: https://ntruprime.cr.yp.to +nist-round: 3 +spec-version: supercop-20200826 +upstream-ancestors: +- https://github.com/jschanck/package-pqclean/tree/4d9f08c3/ntruprime +- supercop-20210604 +parameter-sets: +- name: sntrup761 + claimed-nist-level: 2 + claimed-security: IND-CCA2 + length-ciphertext: 1039 + length-public-key: 1158 + length-secret-key: 1763 + length-shared-secret: 32 + implementations-switch-on-runtime-cpu-features: true + implementations: + - upstream-id: clean + supported-platforms: all + common-crypto: + - AES: liboqs + - SHA2: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + upstream: primary-upstream + - upstream-id: avx2 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + common-crypto: + - AES: liboqs + - SHA2: liboqs + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + upstream: primary-upstream +primary-upstream: + spdx-license-identifier: Public domain + source: https://github.com/PQClean/PQClean/commit/4c9e5a3aa715cc8d1d0e377e4e6e682ebd7602d6 diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index 610f9045df..c1d79b52ee 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -121,6 +121,15 @@ kems: scheme: "1024_90s" pqclean_scheme: kyber1024-90s pretty_name_full: Kyber1024-90s + - + name: ntruprime + default_implementation: clean + upstream_location: pqclean + schemes: + - + scheme: sntrup761 + pqclean_scheme: sntrup761 + pretty_name_full: sntrup761 sigs: - name: dilithium diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f1a9c1f46a..1c29ddfdd7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,6 +34,10 @@ if(OQS_ENABLE_SIG_DILITHIUM) add_subdirectory(sig/dilithium) set(SIG_OBJS ${SIG_OBJS} ${DILITHIUM_OBJS}) endif() +if(OQS_ENABLE_KEM_NTRUPRIME) + add_subdirectory(kem/ntruprime) + set(KEM_OBJS ${KEM_OBJS} ${NTRUPRIME_OBJS}) +endif() if(OQS_ENABLE_SIG_FALCON) add_subdirectory(sig/falcon) set(SIG_OBJS ${SIG_OBJS} ${FALCON_OBJS}) diff --git a/src/kem/kem.c b/src/kem/kem.c index 2292a9e6ad..0cb6dc41b9 100644 --- a/src/kem/kem.c +++ b/src/kem/kem.c @@ -36,6 +36,7 @@ OQS_API const char *OQS_KEM_alg_identifier(size_t i) { OQS_KEM_alg_kyber_512_90s, OQS_KEM_alg_kyber_768_90s, OQS_KEM_alg_kyber_1024_90s, + OQS_KEM_alg_ntruprime_sntrup761, ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END OQS_KEM_alg_frodokem_640_aes, OQS_KEM_alg_frodokem_640_shake, @@ -184,6 +185,12 @@ OQS_API int OQS_KEM_alg_is_enabled(const char *method_name) { return 1; #else return 0; +#endif + } else if (0 == strcasecmp(method_name, OQS_KEM_alg_ntruprime_sntrup761)) { +#ifdef OQS_ENABLE_KEM_ntruprime_sntrup761 + return 1; +#else + return 0; #endif ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ENABLED_CASE_END } else if (0 == 
strcasecmp(method_name, OQS_KEM_alg_frodokem_640_aes)) { @@ -358,6 +365,12 @@ OQS_API OQS_KEM *OQS_KEM_new(const char *method_name) { return OQS_KEM_kyber_1024_90s_new(); #else return NULL; +#endif + } else if (0 == strcasecmp(method_name, OQS_KEM_alg_ntruprime_sntrup761)) { +#ifdef OQS_ENABLE_KEM_ntruprime_sntrup761 + return OQS_KEM_ntruprime_sntrup761_new(); +#else + return NULL; #endif ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_NEW_CASE_END } else if (0 == strcasecmp(method_name, OQS_KEM_alg_frodokem_640_aes)) { diff --git a/src/kem/kem.h b/src/kem/kem.h index 02930ddd18..7803b48ad5 100644 --- a/src/kem/kem.h +++ b/src/kem/kem.h @@ -74,6 +74,8 @@ extern "C" { #define OQS_KEM_alg_kyber_768_90s "Kyber768-90s" /** Algorithm identifier for Kyber1024-90s KEM. */ #define OQS_KEM_alg_kyber_1024_90s "Kyber1024-90s" +/** Algorithm identifier for sntrup761 KEM. */ +#define OQS_KEM_alg_ntruprime_sntrup761 "sntrup761" ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END /** Algorithm identifier for FrodoKEM-640-AES KEM. */ #define OQS_KEM_alg_frodokem_640_aes "FrodoKEM-640-AES" @@ -90,7 +92,7 @@ extern "C" { // EDIT-WHEN-ADDING-KEM ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_START /** Number of algorithm identifiers above. */ -#define OQS_KEM_algs_length 27 +#define OQS_KEM_algs_length 28 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_END /** @@ -271,6 +273,9 @@ OQS_API void OQS_KEM_free(OQS_KEM *kem); #ifdef OQS_ENABLE_KEM_KYBER #include #endif /* OQS_ENABLE_KEM_KYBER */ +#ifdef OQS_ENABLE_KEM_NTRUPRIME +#include +#endif /* OQS_ENABLE_KEM_NTRUPRIME */ ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_INCLUDE_END #ifdef OQS_ENABLE_KEM_FRODOKEM #include diff --git a/src/kem/ntruprime/CMakeLists.txt b/src/kem/ntruprime/CMakeLists.txt new file mode 100644 index 0000000000..97eb85d539 --- /dev/null +++ b/src/kem/ntruprime/CMakeLists.txt @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: MIT + +# This file was generated by +# scripts/copy_from_upstream/copy_from_upstream.py + +set(_NTRUPRIME_OBJS "") + +if(OQS_ENABLE_KEM_ntruprime_sntrup761) + add_library(ntruprime_sntrup761_clean OBJECT kem_ntruprime_sntrup761.c pqclean_sntrup761_clean/crypto_core_inv3sntrup761.c pqclean_sntrup761_clean/crypto_core_invsntrup761.c pqclean_sntrup761_clean/crypto_core_mult3sntrup761.c pqclean_sntrup761_clean/crypto_core_multsntrup761.c pqclean_sntrup761_clean/crypto_core_scale3sntrup761.c pqclean_sntrup761_clean/crypto_core_weightsntrup761.c pqclean_sntrup761_clean/crypto_core_wforcesntrup761.c pqclean_sntrup761_clean/crypto_decode_761x1531.c pqclean_sntrup761_clean/crypto_decode_761x3.c pqclean_sntrup761_clean/crypto_decode_761x4591.c pqclean_sntrup761_clean/crypto_decode_761xint16.c pqclean_sntrup761_clean/crypto_decode_761xint32.c pqclean_sntrup761_clean/crypto_encode_761x1531.c pqclean_sntrup761_clean/crypto_encode_761x1531round.c pqclean_sntrup761_clean/crypto_encode_761x3.c pqclean_sntrup761_clean/crypto_encode_761x4591.c pqclean_sntrup761_clean/crypto_encode_761xfreeze3.c pqclean_sntrup761_clean/crypto_encode_761xint16.c pqclean_sntrup761_clean/crypto_encode_int16.c pqclean_sntrup761_clean/crypto_sort_int32.c pqclean_sntrup761_clean/crypto_sort_uint32.c pqclean_sntrup761_clean/crypto_verify_1039.c pqclean_sntrup761_clean/kem.c) + target_include_directories(ntruprime_sntrup761_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_sntrup761_clean) + target_include_directories(ntruprime_sntrup761_clean PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + if (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + 
target_compile_definitions(ntruprime_sntrup761_clean PRIVATE old_gas_syntax) + endif() + set(_NTRUPRIME_OBJS ${_NTRUPRIME_OBJS} $<TARGET_OBJECTS:ntruprime_sntrup761_clean>) +endif() + +if(OQS_ENABLE_KEM_ntruprime_sntrup761_avx2) + add_library(ntruprime_sntrup761_avx2 OBJECT pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.c pqclean_sntrup761_avx2/crypto_core_invsntrup761.c pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.c pqclean_sntrup761_avx2/crypto_core_multsntrup761.c pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.c pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.c pqclean_sntrup761_avx2/crypto_core_weightsntrup761.c pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.c pqclean_sntrup761_avx2/crypto_decode_761x1531.c pqclean_sntrup761_avx2/crypto_decode_761x3.c pqclean_sntrup761_avx2/crypto_decode_761x4591.c pqclean_sntrup761_avx2/crypto_decode_761xint16.c pqclean_sntrup761_avx2/crypto_decode_761xint32.c pqclean_sntrup761_avx2/crypto_decode_int16.c pqclean_sntrup761_avx2/crypto_encode_761x1531.c pqclean_sntrup761_avx2/crypto_encode_761x1531round.c pqclean_sntrup761_avx2/crypto_encode_761x3.c pqclean_sntrup761_avx2/crypto_encode_761x4591.c pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.c pqclean_sntrup761_avx2/crypto_encode_761xint16.c pqclean_sntrup761_avx2/crypto_encode_int16.c pqclean_sntrup761_avx2/crypto_sort_int32.c pqclean_sntrup761_avx2/crypto_sort_uint32.c pqclean_sntrup761_avx2/crypto_verify_1039.c pqclean_sntrup761_avx2/kem.c) + target_include_directories(ntruprime_sntrup761_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_sntrup761_avx2) + target_include_directories(ntruprime_sntrup761_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(ntruprime_sntrup761_avx2 PRIVATE -mavx2 ) + if (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + target_compile_definitions(ntruprime_sntrup761_avx2 PRIVATE old_gas_syntax) + endif() + set(_NTRUPRIME_OBJS ${_NTRUPRIME_OBJS} $<TARGET_OBJECTS:ntruprime_sntrup761_avx2>) +endif() + +set(NTRUPRIME_OBJS ${_NTRUPRIME_OBJS} PARENT_SCOPE) diff --git a/src/kem/ntruprime/kem_ntruprime.h b/src/kem/ntruprime/kem_ntruprime.h new file mode 100644 index 0000000000..bdbab28710 --- /dev/null +++ b/src/kem/ntruprime/kem_ntruprime.h @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT + +#ifndef OQS_KEM_NTRUPRIME_H +#define OQS_KEM_NTRUPRIME_H + +#include <oqs/oqs.h> + +#ifdef OQS_ENABLE_KEM_ntruprime_sntrup761 +#define OQS_KEM_ntruprime_sntrup761_length_public_key 1158 +#define OQS_KEM_ntruprime_sntrup761_length_secret_key 1763 +#define OQS_KEM_ntruprime_sntrup761_length_ciphertext 1039 +#define OQS_KEM_ntruprime_sntrup761_length_shared_secret 32 +OQS_KEM *OQS_KEM_ntruprime_sntrup761_new(void); +OQS_API OQS_STATUS OQS_KEM_ntruprime_sntrup761_keypair(uint8_t *public_key, uint8_t *secret_key); +OQS_API OQS_STATUS OQS_KEM_ntruprime_sntrup761_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key); +OQS_API OQS_STATUS OQS_KEM_ntruprime_sntrup761_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key); +#endif + +#endif + diff --git a/src/kem/ntruprime/kem_ntruprime_sntrup761.c b/src/kem/ntruprime/kem_ntruprime_sntrup761.c new file mode 100644 index 0000000000..bc69b36209 --- /dev/null +++ b/src/kem/ntruprime/kem_ntruprime_sntrup761.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: MIT + +#include <stdlib.h> + +#include <oqs/kem_ntruprime.h> + +#if defined(OQS_ENABLE_KEM_ntruprime_sntrup761) + +OQS_KEM *OQS_KEM_ntruprime_sntrup761_new(void) { + + OQS_KEM *kem = malloc(sizeof(OQS_KEM)); + if (kem == NULL) { + return NULL; + } + kem->method_name = OQS_KEM_alg_ntruprime_sntrup761; + kem->alg_version = 
"supercop-20210604 via https://github.com/mkannwischer/package-pqclean/tree/5714c895/ntruprime"; + + kem->claimed_nist_level = 2; + kem->ind_cca = true; + + kem->length_public_key = OQS_KEM_ntruprime_sntrup761_length_public_key; + kem->length_secret_key = OQS_KEM_ntruprime_sntrup761_length_secret_key; + kem->length_ciphertext = OQS_KEM_ntruprime_sntrup761_length_ciphertext; + kem->length_shared_secret = OQS_KEM_ntruprime_sntrup761_length_shared_secret; + + kem->keypair = OQS_KEM_ntruprime_sntrup761_keypair; + kem->encaps = OQS_KEM_ntruprime_sntrup761_encaps; + kem->decaps = OQS_KEM_ntruprime_sntrup761_decaps; + + return kem; +} + +extern int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +extern int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); + +#if defined(OQS_ENABLE_KEM_ntruprime_sntrup761_avx2) +extern int PQCLEAN_SNTRUP761_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +extern int PQCLEAN_SNTRUP761_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int PQCLEAN_SNTRUP761_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif + +OQS_API OQS_STATUS OQS_KEM_ntruprime_sntrup761_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_ntruprime_sntrup761_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) PQCLEAN_SNTRUP761_AVX2_crypto_kem_keypair(public_key, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_SNTRUP761_CLEAN_crypto_kem_keypair(public_key, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) PQCLEAN_SNTRUP761_CLEAN_crypto_kem_keypair(public_key, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_KEM_ntruprime_sntrup761_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_ENABLE_KEM_ntruprime_sntrup761_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) PQCLEAN_SNTRUP761_AVX2_crypto_kem_enc(ciphertext, shared_secret, public_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_SNTRUP761_CLEAN_crypto_kem_enc(ciphertext, shared_secret, public_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) PQCLEAN_SNTRUP761_CLEAN_crypto_kem_enc(ciphertext, shared_secret, public_key); +#endif +} + +OQS_API OQS_STATUS OQS_KEM_ntruprime_sntrup761_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_KEM_ntruprime_sntrup761_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) PQCLEAN_SNTRUP761_AVX2_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) PQCLEAN_SNTRUP761_CLEAN_crypto_kem_dec(shared_secret, ciphertext, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) PQCLEAN_SNTRUP761_CLEAN_crypto_kem_dec(shared_secret, ciphertext, secret_key); +#endif +} + +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/LICENSE b/src/kem/ntruprime/pqclean_sntrup761_avx2/LICENSE new file mode 100644 index 0000000000..d5d21fff6d --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git 
a/src/kem/ntruprime/pqclean_sntrup761_avx2/api.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/api.h new file mode 100644 index 0000000000..f154c14458 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/api.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_API_H +#define PQCLEAN_SNTRUP761_AVX2_API_H + +#include + + +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ALGNAME "sntrup761" + +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_SECRETKEYBYTES 1763 +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_PUBLICKEYBYTES 1158 +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CIPHERTEXTBYTES 1039 +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_BYTES 32 + +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk); +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.c new file mode 100644 index 0000000000..03c327ebaa --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.c @@ -0,0 +1,542 @@ +#include "crypto_core_inv3sntrup761.h" +#include + + +#define int8 int8_t +typedef int8 small; + +#define p 761 +#define ppad 768 +#define numvec 3 + +typedef __m256i vec256; + +/* +This code stores 768-coeff poly as vec256[3]. +Order of 256 coefficients in each vec256 +is optimized in light of costs of vector instructions: + 0,4,...,252 in 64-bit word; + 1,5,...,253 in 64-bit word; + 2,6,...,254 in 64-bit word; + 3,7,...,255 in 64-bit word. +*/ + +static inline void vec256_frombits(vec256 *v, const small *b) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 b0 = _mm256_loadu_si256((vec256 *) b); + b += 32; /* 0,1,...,31 */ + vec256 b1 = _mm256_loadu_si256((vec256 *) b); + b += 32; /* 32,33,... */ + vec256 b2 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b3 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b4 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b5 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b6 = _mm256_loadu_si256((vec256 *) b); + b += 32; + vec256 b7 = _mm256_loadu_si256((vec256 *) b); + b += 32; + + vec256 c0 = _mm256_unpacklo_epi32(b0, b1); /* 0 1 2 3 32 33 34 35 4 5 6 7 36 37 38 39 ... 55 */ + vec256 c1 = _mm256_unpackhi_epi32(b0, b1); /* 8 9 10 11 40 41 42 43 ... 
63 */ + vec256 c2 = _mm256_unpacklo_epi32(b2, b3); + vec256 c3 = _mm256_unpackhi_epi32(b2, b3); + vec256 c4 = _mm256_unpacklo_epi32(b4, b5); + vec256 c5 = _mm256_unpackhi_epi32(b4, b5); + vec256 c6 = _mm256_unpacklo_epi32(b6, b7); + vec256 c7 = _mm256_unpackhi_epi32(b6, b7); + + vec256 d0 = c0 | _mm256_slli_epi32(c1, 2); /* 0 8, 1 9, 2 10, 3 11, 32 40, 33 41, ..., 55 63 */ + vec256 d2 = c2 | _mm256_slli_epi32(c3, 2); + vec256 d4 = c4 | _mm256_slli_epi32(c5, 2); + vec256 d6 = c6 | _mm256_slli_epi32(c7, 2); + + vec256 e0 = _mm256_unpacklo_epi64(d0, d2); + vec256 e2 = _mm256_unpackhi_epi64(d0, d2); + vec256 e4 = _mm256_unpacklo_epi64(d4, d6); + vec256 e6 = _mm256_unpackhi_epi64(d4, d6); + + vec256 f0 = e0 | _mm256_slli_epi32(e2, 1); + vec256 f4 = e4 | _mm256_slli_epi32(e6, 1); + + vec256 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + vec256 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + + vec256 h = g0 | _mm256_slli_epi32(g4, 4); + +#define TRANSPOSE _mm256_set_epi8( 31,27,23,19, 30,26,22,18, 29,25,21,17, 28,24,20,16, 15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0 ) + h = _mm256_shuffle_epi8(h, TRANSPOSE); + h = _mm256_permute4x64_epi64(h, 0xd8); + h = _mm256_shuffle_epi32(h, 0xd8); + + *v++ = h; + } +} + +static inline void vec256_tobits(const vec256 *v, small *b) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 h = *v++; + + h = _mm256_shuffle_epi32(h, 0xd8); + h = _mm256_permute4x64_epi64(h, 0xd8); + h = _mm256_shuffle_epi8(h, TRANSPOSE); + + vec256 g0 = h & _mm256_set1_epi8(15); + vec256 g4 = _mm256_srli_epi32(h, 4) & _mm256_set1_epi8(15); + + vec256 f0 = _mm256_permute2x128_si256(g0, g4, 0x20); + vec256 f4 = _mm256_permute2x128_si256(g0, g4, 0x31); + + vec256 e0 = f0 & _mm256_set1_epi8(5); + vec256 e2 = _mm256_srli_epi32(f0, 1) & _mm256_set1_epi8(5); + vec256 e4 = f4 & _mm256_set1_epi8(5); + vec256 e6 = _mm256_srli_epi32(f4, 1) & _mm256_set1_epi8(5); + + vec256 d0 = _mm256_unpacklo_epi32(e0, e2); + vec256 d2 = _mm256_unpackhi_epi32(e0, e2); + vec256 d4 = _mm256_unpacklo_epi32(e4, e6); + vec256 d6 = _mm256_unpackhi_epi32(e4, e6); + + vec256 c0 = d0 & _mm256_set1_epi8(1); + vec256 c1 = _mm256_srli_epi32(d0, 2) & _mm256_set1_epi8(1); + vec256 c2 = d2 & _mm256_set1_epi8(1); + vec256 c3 = _mm256_srli_epi32(d2, 2) & _mm256_set1_epi8(1); + vec256 c4 = d4 & _mm256_set1_epi8(1); + vec256 c5 = _mm256_srli_epi32(d4, 2) & _mm256_set1_epi8(1); + vec256 c6 = d6 & _mm256_set1_epi8(1); + vec256 c7 = _mm256_srli_epi32(d6, 2) & _mm256_set1_epi8(1); + + vec256 b0 = _mm256_unpacklo_epi64(c0, c1); + vec256 b1 = _mm256_unpackhi_epi64(c0, c1); + vec256 b2 = _mm256_unpacklo_epi64(c2, c3); + vec256 b3 = _mm256_unpackhi_epi64(c2, c3); + vec256 b4 = _mm256_unpacklo_epi64(c4, c5); + vec256 b5 = _mm256_unpackhi_epi64(c4, c5); + vec256 b6 = _mm256_unpacklo_epi64(c6, c7); + vec256 b7 = _mm256_unpackhi_epi64(c6, c7); + + _mm256_storeu_si256((vec256 *) b, b0); + b += 32; + _mm256_storeu_si256((vec256 *) b, b1); + b += 32; + _mm256_storeu_si256((vec256 *) b, b2); + b += 32; + _mm256_storeu_si256((vec256 *) b, b3); + b += 32; + _mm256_storeu_si256((vec256 *) b, b4); + b += 32; + _mm256_storeu_si256((vec256 *) b, b5); + b += 32; + _mm256_storeu_si256((vec256 *) b, b6); + b += 32; + _mm256_storeu_si256((vec256 *) b, b7); + b += 32; + } +} + +static void vec256_init(vec256 *G0, vec256 *G1, const small *s) { + int i; + small srev[ppad + (ppad - p)]; + small si; + small g0[ppad]; + small g1[ppad]; + + for (i = 0; i < p; ++i) { + srev[ppad - 1 - i] = s[i]; + } + for (i = 0; i < ppad - p; ++i) { + srev[i] = 0; + } + for (i = 
p; i < ppad; ++i) { + srev[i + ppad - p] = 0; + } + + for (i = 0; i < ppad; ++i) { + si = srev[i + ppad - p]; + g0[i] = si & 1; + g1[i] = (si >> 1) & g0[i]; + } + + vec256_frombits(G0, g0); + vec256_frombits(G1, g1); +} + +static void vec256_final(small *out, const vec256 *V0, const vec256 *V1) { + int i; + small v0[ppad]; + small v1[ppad]; + small v[ppad]; + small vrev[ppad + (ppad - p)]; + + vec256_tobits(V0, v0); + vec256_tobits(V1, v1); + + for (i = 0; i < ppad; ++i) { + v[i] = (small) (v0[i] + 2 * v1[i] - 4 * (v0[i] & v1[i])); + } + + for (i = 0; i < ppad; ++i) { + vrev[i] = v[ppad - 1 - i]; + } + for (i = ppad; i < ppad + (ppad - p); ++i) { + vrev[i] = 0; + } + + for (i = 0; i < p; ++i) { + out[i] = vrev[i + ppad - p]; + } +} + +static inline int negative_mask(int x) { + return x >> 31; +} + +static inline void vec256_swap(vec256 *f, vec256 *g, int len, vec256 mask) { + vec256 flip; + int i; + + for (i = 0; i < len; ++i) { + flip = mask & (f[i] ^ g[i]); + f[i] ^= flip; + g[i] ^= flip; + } +} + +static inline void vec256_scale(vec256 *f0, vec256 *f1, const vec256 c0, const vec256 c1) { + int i; + + for (i = 0; i < numvec; ++i) { + vec256 f0i = f0[i]; + vec256 f1i = f1[i]; + + f0i &= c0; + f1i ^= c1; + f1i &= f0i; + + f0[i] = f0i; + f1[i] = f1i; + } +} + +static inline void vec256_eliminate(vec256 *f0, vec256 *f1, vec256 *g0, vec256 *g1, int len, const vec256 c0, const vec256 c1) { + int i; + + for (i = 0; i < len; ++i) { + vec256 f0i = f0[i]; + vec256 f1i = f1[i]; + vec256 g0i = g0[i]; + vec256 g1i = g1[i]; + vec256 t; + + f0i &= c0; + f1i ^= c1; + f1i &= f0i; + + t = g0i ^ f0i; + g0[i] = t | (g1i ^ f1i); + g1[i] = (g1i ^ f0i) & (f1i ^ t); + } +} + +static inline int vec256_bit0mask(vec256 *f) { + return -(_mm_cvtsi128_si32(_mm256_castsi256_si128(f[0])) & 1); +} + +static inline void vec256_divx_1(vec256 *f) { + vec256 f0 = f[0]; + + unsigned long long low0 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + + low0 = low0 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, (long long) low0), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); +} + +static inline void vec256_divx_2(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + + unsigned long long low0 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = low1 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, (long long) low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, (long long) low1), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 0x39); +} + +static inline void vec256_divx_3(vec256 *f) { + vec256 f0 = f[0]; + vec256 f1 = f[1]; + vec256 f2 = f[2]; + + unsigned long long low0 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + unsigned long long low2 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + + low0 = (low0 >> 1) | (low1 << 63); + low1 = (low1 >> 1) | (low2 << 63); + low2 = low2 >> 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, (long long) low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, (long long) low1), 0x3); + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, (long long) low2), 0x3); + + f[0] = _mm256_permute4x64_epi64(f0, 0x39); + f[1] = _mm256_permute4x64_epi64(f1, 
0x39); + f[2] = _mm256_permute4x64_epi64(f2, 0x39); +} + +static inline void vec256_timesx_1(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + + unsigned long long low0 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, (long long) low0), 0x3); + + f[0] = f0; +} + +static inline void vec256_timesx_2(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + + unsigned long long low0 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f0)); + unsigned long long low1 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f1)); + + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 1; + + f0 = _mm256_blend_epi32(f0, _mm256_set_epi64x(0, 0, 0, (long long) low0), 0x3); + f1 = _mm256_blend_epi32(f1, _mm256_set_epi64x(0, 0, 0, (long long) low1), 0x3); + + f[0] = f0; + f[1] = f1; +} + +static inline void vec256_timesx_3(vec256 *f) { + vec256 f0 = _mm256_permute4x64_epi64(f[0], 0x93); + vec256 f1 = _mm256_permute4x64_epi64(f[1], 0x93); + vec256 f2 = _mm256_permute4x64_epi64(f[2], 0x93); + + unsigned long long low0 = *(unsigned long long *) &f0; + unsigned long long low1 = *(unsigned long long *) &f1; + unsigned long long low2 = (unsigned long long) _mm_cvtsi128_si64(_mm256_castsi256_si128(f2)); + + low2 = (low2 << 1) | (low1 >> 63); + low1 = (low1 << 1) | (low0 >> 63); + low0 = low0 << 1; + + *(unsigned long long *) &f0 = low0; + *(unsigned long long *) &f1 = low1; + f2 = _mm256_blend_epi32(f2, _mm256_set_epi64x(0, 0, 0, (long long) low2), 0x3); + + f[0] = f0; + f[1] = f1; + f[2] = f2; +} + + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + small *in = (void *) inbytes; + vec256 F0[numvec]; + vec256 F1[numvec]; + vec256 G0[numvec]; + vec256 G1[numvec]; + vec256 V0[numvec]; + vec256 V1[numvec]; + vec256 R0[numvec]; + vec256 R1[numvec]; + vec256 c0vec, c1vec; + int loop; + int c0, c1; + int minusdelta = -1; + int swapmask; + vec256 swapvec; + + vec256_init(G0, G1, in); + F0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); + F0[1] = _mm256_set1_epi32(0); + F0[2] = _mm256_set_epi32(0, 0, 0, 0, 1073741824, 0, 1073741824, 0); + F1[0] = _mm256_set1_epi32(0); + F1[1] = _mm256_set1_epi32(0); + F1[2] = _mm256_set_epi32(0, 0, 0, 0, 1073741824, 0, 1073741824, 0); + + V0[0] = _mm256_set1_epi32(0); + V1[0] = _mm256_set1_epi32(0); + V0[1] = _mm256_set1_epi32(0); + V1[1] = _mm256_set1_epi32(0); + V0[2] = _mm256_set1_epi32(0); + V1[2] = _mm256_set1_epi32(0); + + R0[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1); + R1[0] = _mm256_set1_epi32(0); + R0[1] = _mm256_set1_epi32(0); + R1[1] = _mm256_set1_epi32(0); + R0[2] = _mm256_set1_epi32(0); + R1[2] = _mm256_set1_epi32(0); + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_1(V0); + vec256_timesx_1(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 1, swapvec); + 
vec256_swap(V1, R1, 1, swapvec); + vec256_eliminate(V0, V1, R0, R1, 1, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_2(V0); + vec256_timesx_2(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 2, swapvec); + vec256_swap(V1, R1, 2, swapvec); + vec256_eliminate(V0, V1, R0, R1, 2, c0vec, c1vec); + } + + for (loop = 497; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 3, swapvec); + vec256_swap(F1, G1, 3, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 3, c0vec, c1vec); + vec256_divx_3(G0); + vec256_divx_3(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 2, swapvec); + vec256_swap(F1, G1, 2, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 2, c0vec, c1vec); + vec256_divx_2(G0); + vec256_divx_2(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + for (loop = 256; loop > 0; --loop) { + vec256_timesx_3(V0); + vec256_timesx_3(V1); + swapmask = negative_mask(minusdelta) & vec256_bit0mask(G0); + + c0 = vec256_bit0mask(F0) & vec256_bit0mask(G0); + c1 = vec256_bit0mask(F1) ^ vec256_bit0mask(G1); + c1 &= c0; + + minusdelta ^= swapmask & (minusdelta ^ -minusdelta); + minusdelta -= 1; + + swapvec = _mm256_set1_epi32(swapmask); + vec256_swap(F0, G0, 1, swapvec); + vec256_swap(F1, G1, 1, swapvec); + + c0vec = _mm256_set1_epi32(c0); + c1vec = _mm256_set1_epi32(c1); + + vec256_eliminate(F0, F1, G0, G1, 1, c0vec, c1vec); + vec256_divx_1(G0); + vec256_divx_1(G1); + + vec256_swap(V0, R0, 3, swapvec); + vec256_swap(V1, R1, 3, swapvec); + vec256_eliminate(V0, V1, R0, R1, 3, c0vec, c1vec); + } + + c0vec = _mm256_set1_epi32(vec256_bit0mask(F0)); + c1vec = _mm256_set1_epi32(vec256_bit0mask(F1)); + vec256_scale(V0, V1, c0vec, c1vec); + + vec256_final(out, V0, V1); + out[p] = (small) negative_mask(minusdelta); + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.h new file mode 100644 index 0000000000..3ad254757e --- /dev/null +++ 
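The AVX2 GF(3) inversion above keeps every ternary coefficient in two bit-planes (G0/G1, V0/V1): plane 0 records "coefficient is nonzero" and plane 1 records "coefficient is -1", which is exactly what vec256_init packs and vec256_final unpacks. A scalar sketch of that encoding, with hypothetical helper names and not part of the patch:

#include <stdint.h>

/* Encode one coefficient in {-1, 0, 1} the way vec256_init does. */
static void trit_to_planes(int8_t s, int8_t *plane0, int8_t *plane1) {
    *plane0 = s & 1;              /* 0 -> 0, +1 -> 1, -1 (0xff) -> 1 */
    *plane1 = (s >> 1) & *plane0; /* only -1 keeps a 1 after the arithmetic shift */
}

/* Decode with the same arithmetic vec256_final uses:
   (0,0) -> 0, (1,0) -> +1, (1,1) -> 1 + 2 - 4 = -1. */
static int8_t planes_to_trit(int8_t plane0, int8_t plane1) {
    return (int8_t) (plane0 + 2 * plane1 - 4 * (plane0 & plane1));
}

Working on these two planes is what lets vec256_eliminate and vec256_swap process 256 coefficients per 256-bit vector operation inside the constant-time divstep loops above.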
b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_inv3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_INV3SNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_INV3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761_OUTPUTBYTES 762 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_invsntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_invsntrup761.c new file mode 100644 index 0000000000..e27bedfb7a --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_invsntrup761.c @@ -0,0 +1,217 @@ +#include "crypto_core_invsntrup761.h" +#include "params.h" +#include + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + + + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = (uint16) x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = ~v + 1; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -(int) v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + return x >> 15; /* XXX: theoretically need gcc -fwrapv for this */ +} + +/* ----- arithmetic mod q */ + +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -7000000 < x < 7000000 if q in 4591, 4621, 5167, 6343, 7177, 7879 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return (Fq) x; +} + +static Fq Fq_bigfreeze(int32 x) { + x -= q * ((q14 * x) >> 14); + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + x -= q * ((q27 * x + 67108864) >> 27); + return x; +} + +/* nonnegative e */ +static Fq Fq_pow(Fq a, int e) { + if (e == 0) { + return 1; + } + if (e == 1) { + return a; + } + if (e & 1) { + return Fq_bigfreeze(a * (int32)Fq_pow(a, e - 1)); + } + a = Fq_bigfreeze(a * (int32)a); + return Fq_pow(a, e >> 1); +} + +static Fq Fq_recip(Fq a) { + return Fq_pow(a, q - 2); +} + +/* ----- more */ + +#define qvec _mm256_set1_epi16(q) +#define qinvvec _mm256_set1_epi16(qinv) + +static inline __m256i montproduct(__m256i x, __m256i y, __m256i yqinv) { + __m256i hi, d, e; + + d = _mm256_mullo_epi16(x, yqinv); + hi = _mm256_mulhi_epi16(x, y); + e = _mm256_mulhi_epi16(d, qvec); + return _mm256_sub_epi16(hi, e); +} + +static inline void vectormodq_swapeliminate(Fq *f, Fq *g, int len, const Fq f0, const Fq g0, int mask) { + __m256i f0vec = _mm256_set1_epi16(f0); + __m256i g0vec = _mm256_set1_epi16(g0); + __m256i f0vecqinv = _mm256_mullo_epi16(f0vec, qinvvec); + __m256i g0vecqinv = _mm256_mullo_epi16(g0vec, qinvvec); + __m256i maskvec = _mm256_set1_epi32(mask); + + while (len > 0) { + __m256i fi = _mm256_loadu_si256((__m256i *) f); + __m256i gi = _mm256_loadu_si256((__m256i *) g); + __m256i finew = _mm256_blendv_epi8(fi, gi, maskvec); + __m256i ginew = _mm256_blendv_epi8(gi, fi, maskvec); + ginew = _mm256_sub_epi16(montproduct(ginew, f0vec, f0vecqinv), montproduct(finew, g0vec, g0vecqinv)); + _mm256_storeu_si256((__m256i *) f, finew); + 
_mm256_storeu_si256((__m256i *) (g - 1), ginew); + f += 16; + g += 16; + len -= 16; + } +} + +static inline void vectormodq_xswapeliminate(Fq *f, Fq *g, int len, const Fq f0, const Fq g0, int mask) { + __m256i f0vec = _mm256_set1_epi16(f0); + __m256i g0vec = _mm256_set1_epi16(g0); + __m256i f0vecqinv = _mm256_mullo_epi16(f0vec, qinvvec); + __m256i g0vecqinv = _mm256_mullo_epi16(g0vec, qinvvec); + __m256i maskvec = _mm256_set1_epi32(mask); + + f += len + (-len & 15); + g += len + (-len & 15); + while (len > 0) { + f -= 16; + g -= 16; + len -= 16; + __m256i fi = _mm256_loadu_si256((__m256i *) f); + __m256i gi = _mm256_loadu_si256((__m256i *) g); + __m256i finew = _mm256_blendv_epi8(fi, gi, maskvec); + __m256i ginew = _mm256_blendv_epi8(gi, fi, maskvec); + ginew = _mm256_sub_epi16(montproduct(ginew, f0vec, f0vecqinv), montproduct(finew, g0vec, g0vecqinv)); + _mm256_storeu_si256((__m256i *) (f + 1), finew); + _mm256_storeu_si256((__m256i *) g, ginew); + } +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *in = (void *) inbytes; + int loop; + Fq out[p], f[ppad], g[ppad], v[ppad], r[ppad]; + Fq f0, g0; + Fq scale; + int i; + int delta = 1; + int minusdelta; + int fgflip; + int swap; + + for (i = 0; i < ppad; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = -1; + f[p] = -1; + /* generalization: initialize f to reversal of any deg-p polynomial m */ + + for (i = 0; i < p; ++i) { + g[i] = in[p - 1 - i]; + } + for (i = p; i < ppad; ++i) { + g[i] = 0; + } + + for (i = 0; i < ppad; ++i) { + r[i] = 0; + } + r[0] = Fq_recip(3); + + for (i = 0; i < ppad; ++i) { + v[i] = 0; + } + + for (loop = 0; loop < p; ++loop) { + g0 = Fq_freeze(g[0]); + f0 = f[0]; + if (q > 5167) { + f0 = Fq_freeze(f0); + } + + minusdelta = -delta; + swap = int16_negative_mask((int16) minusdelta) & int16_nonzero_mask(g0); + delta ^= swap & (delta ^ minusdelta); + delta += 1; + + fgflip = swap & (f0 ^ g0); + f0 ^= (Fq) fgflip; + g0 ^= (Fq) fgflip; + + f[0] = f0; + + vectormodq_swapeliminate(f + 1, g + 1, p, f0, g0, swap); + vectormodq_xswapeliminate(v, r, loop + 1, f0, g0, swap); + } + + for (loop = p - 1; loop > 0; --loop) { + g0 = Fq_freeze(g[0]); + f0 = f[0]; + if (q > 5167) { + f0 = Fq_freeze(f0); + } + + minusdelta = -delta; + swap = int16_negative_mask((int16) minusdelta) & int16_nonzero_mask(g0); + delta ^= swap & (delta ^ minusdelta); + delta += 1; + + fgflip = swap & (f0 ^ g0); + f0 ^= (Fq) fgflip; + g0 ^= (Fq) fgflip; + + f[0] = f0; + + vectormodq_swapeliminate(f + 1, g + 1, loop, f0, g0, swap); + vectormodq_xswapeliminate(v, r, p, f0, g0, swap); + } + + scale = Fq_recip(Fq_freeze(f[0])); + for (i = 0; i < p; ++i) { + out[i] = Fq_bigfreeze(scale * (int32)Fq_freeze(v[p - i])); + } + + crypto_encode_pxint16(outbytes, out); + outbytes[2 * p] = (unsigned char) int16_nonzero_mask((int16) delta); + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_invsntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_invsntrup761.h new file mode 100644 index 0000000000..1a2adf1602 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_invsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_INVSNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_INVSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761_OUTPUTBYTES 1523 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761_KEYBYTES 0 +#define 
PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.c new file mode 100644 index 0000000000..b9213dcd55 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.c @@ -0,0 +1,258 @@ +#include "crypto_core_mult3sntrup761.h" +#include "crypto_core_multsntrup761_ntt.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[6][512]; + int16x16 _dummy; +} vec6x512; + +typedef union { + int16 v[768]; + int16x16 _dummy; +} vec768; + +typedef union { + int16 v[3 * 512]; + int16x16 _dummy; +} vec1536; + +static int16x16 squeeze_3_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(10923)), const_x16(3))); +} + +static int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) +#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) +#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) + +static void good(int16 fpad[3][512], const int16 f[768]) { + int j; + int16x16 f0, f1; + + j = 0; + for (;;) { + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); + j += 16; + if (j == 256) { + break; + } + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); + j += 16; + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); + j += 16; + } + for (;;) { + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask2); + store_x16(&fpad[1][j], f0 & mask0); + store_x16(&fpad[2][j], f0 & mask1); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask1); + store_x16(&fpad[1][j], f0 & mask2); + store_x16(&fpad[2][j], f0 & mask0); + j += 16; + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask0); + store_x16(&fpad[1][j], f0 & mask1); + store_x16(&fpad[2][j], f0 & mask2); + j += 16; + } +} + +static void ungood(int16 f[1536], const int16 fpad[3][512]) { + int j; + int16x16 f0, f1, f2, g0, g1, g2; + + j = 0; 
+ + for (;;) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + } +} + +static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { + vec6x512 fgpad; +#define fpad (fgpad.v) +#define gpad (fgpad.v+3) +#define hpad fpad + vec1536 aligned_h_7681; +#define h_7681 (aligned_h_7681.v) + int i; + + good(fpad, f); + good(gpad, g); + + PQCLEAN_SNTRUP761_AVX2_ntt512_7681(fgpad.v[0], 6); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + } + + PQCLEAN_SNTRUP761_AVX2_invntt512_7681(hpad[0], 3); + ungood(h_7681, (const int16(*)[512]) hpad); + + for (i = 0; i < 1536; i += 16) { + int16x16 u = load_x16(&h_7681[i]); + u = mulmod_7681_x16(u, const_x16(956)); + store_x16(&h[i], u); + } +} + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16 + +#define p 761 + +static inline int16x16 freeze_3_x16(int16x16 x) { + int16x16 mask, x3; + x = add_x16(x, const_x16(3)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16(2))); + x3 = sub_x16(x, const_x16(3)); + x = _mm256_blendv_epi8(x3, x, mask); + return x; +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec768 x1, x2; + vec1536 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 768; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 768; i += 16) { + store_x16(&g[i], x); + } + + for (i = 0; 
i < p; ++i) { + int8 fi = (int8) inbytes[i]; + int8 fi0 = fi & 1; + f[i] = (int16) (fi0 - (fi & (fi0 << 1))); + } + for (i = 0; i < p; ++i) { + int8 gi = (int8) kbytes[i]; + int8 gi0 = gi & 1; + g[i] = (int16) (gi0 - (gi & (gi0 << 1))); + } + + mult768(fg, f, g); + + fg[0] = (int16) (fg[0] - fg[p - 1]); + for (i = 0; i < 768; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_3_x16(squeeze_3_x16(x)); + store_x16(&h[i], x); + } + + for (i = 0; i < p; ++i) { + outbytes[i] = (unsigned char) h[i]; + } + + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.h new file mode 100644 index 0000000000..051fd59078 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_mult3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_MULT3SNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_MULT3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761_OUTPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761_KEYBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761.c new file mode 100644 index 0000000000..5bb2e445ea --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761.c @@ -0,0 +1,313 @@ +#include "crypto_core_multsntrup761.h" +#include "crypto_core_multsntrup761_ntt.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" +#include + +typedef int8_t int8; +typedef int16_t int16; + +#define int16x16 __m256i +#define load_x16(p) _mm256_loadu_si256((int16x16 *) (p)) +#define store_x16(p,v) _mm256_storeu_si256((int16x16 *) (p),(v)) +#define const_x16 _mm256_set1_epi16 +#define add_x16 _mm256_add_epi16 +#define sub_x16 _mm256_sub_epi16 +#define mullo_x16 _mm256_mullo_epi16 +#define mulhi_x16 _mm256_mulhi_epi16 +#define mulhrs_x16 _mm256_mulhrs_epi16 +#define signmask_x16(x) _mm256_srai_epi16((x),15) + +typedef union { + int16 v[6][512]; + int16x16 _dummy; +} vec6x512; + +typedef union { + int16 v[768]; + int16x16 _dummy; +} vec768; + +typedef union { + int16 v[3 * 512]; + int16x16 _dummy; +} vec1536; + +static inline int16x16 squeeze_4591_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(7)), const_x16(4591))); +} + +static inline int16x16 squeeze_7681_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(4)), const_x16(7681))); +} + +static inline int16x16 squeeze_10753_x16(int16x16 x) { + return sub_x16(x, mullo_x16(mulhrs_x16(x, const_x16(3)), const_x16(10753))); +} + +static inline int16x16 mulmod_4591_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(15631)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(4591)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_7681_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-7679)); /* XXX: precompute */ + int16x16 b = 
mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(7681)); + return sub_x16(b, e); +} + +static inline int16x16 mulmod_10753_x16(int16x16 x, int16x16 y) { + int16x16 yqinv = mullo_x16(y, const_x16(-10751)); /* XXX: precompute */ + int16x16 b = mulhi_x16(x, y); + int16x16 d = mullo_x16(x, yqinv); + int16x16 e = mulhi_x16(d, const_x16(10753)); + return sub_x16(b, e); +} + +#define mask0 _mm256_set_epi16(-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1) +#define mask1 _mm256_set_epi16(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0) +#define mask2 _mm256_set_epi16(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0) + +static void good(int16 fpad[3][512], const int16 f[768]) { + int j; + int16x16 f0, f1; + + j = 0; + for (;;) { + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[1][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[2][j], (f0 & mask2) | (f1 & mask0)); + j += 16; + if (j == 256) { + break; + } + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[1][j], (f0 & mask0) | (f1 & mask1)); + store_x16(&fpad[2][j], (f0 & mask1) | (f1 & mask2)); + j += 16; + + f0 = load_x16(f + j); + f1 = load_x16(f + 512 + j); + store_x16(&fpad[0][j], (f0 & mask1) | (f1 & mask2)); + store_x16(&fpad[1][j], (f0 & mask2) | (f1 & mask0)); + store_x16(&fpad[2][j], (f0 & mask0) | (f1 & mask1)); + j += 16; + } + for (;;) { + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask2); + store_x16(&fpad[1][j], f0 & mask0); + store_x16(&fpad[2][j], f0 & mask1); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask1); + store_x16(&fpad[1][j], f0 & mask2); + store_x16(&fpad[2][j], f0 & mask0); + j += 16; + + f0 = load_x16(f + j); + store_x16(&fpad[0][j], f0 & mask0); + store_x16(&fpad[1][j], f0 & mask1); + store_x16(&fpad[2][j], f0 & mask2); + j += 16; + } +} + +static void ungood(int16 f[1536], const int16 fpad[3][512]) { + int j; + int16x16 f0, f1, f2, g0, g1, g2; + + j = 0; + + for (;;) { + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g1 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask2)|(f1&mask0)|(f2&mask1) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g1 = (f0 & mask0) | (f1 & mask1) | (f2 & mask2); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask1)|(f1&mask2)|(f2&mask0) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + if (j == 512) { + break; + } + + f0 = load_x16(&fpad[0][j]); + f1 = load_x16(&fpad[1][j]); + f2 = load_x16(&fpad[2][j]); + g0 = (f0 & mask1) | (f1 & mask2) | (f2 & mask0); + g1 = (f0 & mask2) | (f1 & mask0) | (f2 & mask1); + g2 = f0 ^ f1 ^ f2 ^ g0 ^ g1; /* same as (f0&mask0)|(f1&mask1)|(f2&mask2) */ + store_x16(f + 0 + j, g0); + store_x16(f + 512 + j, g1); + store_x16(f + 1024 + j, g2); + j += 16; + } +} + +static void mult768(int16 h[1536], const int16 f[768], const int16 g[768]) { + vec6x512 fgpad; +#define fpad (fgpad.v) +#define gpad (fgpad.v+3) +#define hpad fpad + vec1536 aligned_h_7681; + vec1536 aligned_h_10753; +#define h_7681 (aligned_h_7681.v) +#define h_10753 (aligned_h_10753.v) + int 
i; + + good(fpad, f); + good(gpad, g); + + PQCLEAN_SNTRUP761_AVX2_ntt512_7681(fgpad.v[0], 6); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_7681_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_7681_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_7681_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_7681_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_7681_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_7681_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_7681_x16(f0, g0); + int16x16 d1 = mulmod_7681_x16(f1, g1); + int16x16 d2 = mulmod_7681_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_7681_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_7681_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_7681_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_7681_x16(h0)); + store_x16(&hpad[1][i], squeeze_7681_x16(h1)); + store_x16(&hpad[2][i], squeeze_7681_x16(h2)); + } + + PQCLEAN_SNTRUP761_AVX2_invntt512_7681(hpad[0], 3); + ungood(h_7681, (const int16(*)[512]) hpad); + + good(fpad, f); + good(gpad, g); + + PQCLEAN_SNTRUP761_AVX2_ntt512_10753(fgpad.v[0], 6); + + for (i = 0; i < 512; i += 16) { + int16x16 f0 = squeeze_10753_x16(load_x16(&fpad[0][i])); + int16x16 f1 = squeeze_10753_x16(load_x16(&fpad[1][i])); + int16x16 f2 = squeeze_10753_x16(load_x16(&fpad[2][i])); + int16x16 g0 = squeeze_10753_x16(load_x16(&gpad[0][i])); + int16x16 g1 = squeeze_10753_x16(load_x16(&gpad[1][i])); + int16x16 g2 = squeeze_10753_x16(load_x16(&gpad[2][i])); + int16x16 d0 = mulmod_10753_x16(f0, g0); + int16x16 d1 = mulmod_10753_x16(f1, g1); + int16x16 d2 = mulmod_10753_x16(f2, g2); + int16x16 dsum = add_x16(add_x16(d0, d1), d2); + int16x16 h0 = add_x16(dsum, mulmod_10753_x16(sub_x16(f2, f1), sub_x16(g1, g2))); + int16x16 h1 = add_x16(dsum, mulmod_10753_x16(sub_x16(f1, f0), sub_x16(g0, g1))); + int16x16 h2 = add_x16(dsum, mulmod_10753_x16(sub_x16(f0, f2), sub_x16(g2, g0))); + store_x16(&hpad[0][i], squeeze_10753_x16(h0)); + store_x16(&hpad[1][i], squeeze_10753_x16(h1)); + store_x16(&hpad[2][i], squeeze_10753_x16(h2)); + } + + PQCLEAN_SNTRUP761_AVX2_invntt512_10753(hpad[0], 3); + ungood(h_10753, (const int16(*)[512]) hpad); + + for (i = 0; i < 1536; i += 16) { + int16x16 u1 = load_x16(&h_10753[i]); + int16x16 u2 = load_x16(&h_7681[i]); + int16x16 t; + u1 = mulmod_10753_x16(u1, const_x16(1268)); + u2 = mulmod_7681_x16(u2, const_x16(956)); + t = mulmod_7681_x16(sub_x16(u2, u1), const_x16(-2539)); + t = add_x16(u1, mulmod_4591_x16(t, const_x16(-710))); + store_x16(&h[i], t); + } +} + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16 + +#define p 761 +#define q 4591 + +static inline int16x16 freeze_4591_x16(int16x16 x) { + int16x16 mask, xq; + x = add_x16(x, const_x16(q)&signmask_x16(x)); + mask = signmask_x16(sub_x16(x, const_x16((q + 1) / 2))); + xq = sub_x16(x, const_x16(q)); + x = _mm256_blendv_epi8(xq, x, mask); + return x; +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + vec768 x1, x2; + vec1536 x3; +#define f (x1.v) +#define g (x2.v) +#define fg (x3.v) +#define h f + int i; + int16x16 x; + + x = const_x16(0); + for (i = p & ~15; i < 768; i += 16) { + store_x16(&f[i], x); + } + for (i = p & ~15; i < 768; i += 16) { + store_x16(&g[i], x); + } + + 
crypto_decode_pxint16(f, inbytes); + + for (i = 0; i < 768; i += 16) { + x = load_x16(&f[i]); + x = freeze_4591_x16(squeeze_4591_x16(x)); + store_x16(&f[i], x); + } + for (i = 0; i < p; ++i) { + int8 gi = (int8) kbytes[i]; + int8 gi0 = gi & 1; + g[i] = (int16) (gi0 - (gi & (gi0 << 1))); + } + + mult768(fg, f, g); + + fg[0] = (int16) (fg[0] - fg[p - 1]); + for (i = 0; i < 768; i += 16) { + int16x16 fgi = load_x16(&fg[i]); + int16x16 fgip = load_x16(&fg[i + p]); + int16x16 fgip1 = load_x16(&fg[i + p - 1]); + x = add_x16(fgi, add_x16(fgip, fgip1)); + x = freeze_4591_x16(squeeze_4591_x16(x)); + store_x16(&h[i], x); + } + + crypto_encode_pxint16(outbytes, h); + + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761.h new file mode 100644 index 0000000000..846aea2eca --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_MULTSNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_MULTSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761_INPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761_KEYBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.c new file mode 100644 index 0000000000..689062e2b6 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.c @@ -0,0 +1,2175 @@ +#include "crypto_core_multsntrup761_ntt.h" +#include + +// auto-generated; do not edit + + +#define _mm256_permute2x128_si256_lo(f0,f1) _mm256_permute2x128_si256(f0,f1,0x20) +#define _mm256_permute2x128_si256_hi(f0,f1) _mm256_permute2x128_si256(f0,f1,0x31) +#define int16x16 __m256i + +typedef int16_t int16; +typedef int32_t int32; + +typedef union { + int16 data[106 * 16]; + __m256i _dummy; +} vec1696; + +static const vec1696 qdata_7681 = { .data = { +#define precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+0) + -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625, -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625, +#define precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+16) + -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182, -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182, +#define precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+32) + -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182, -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182, +#define precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+48) + 3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625, 3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625, +#define precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+64) + -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194, -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194, +#define precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+80) + -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100, -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100, +#define precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const 
int16x16 *)(qdata+96) + -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696, -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696, +#define precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+112) + -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456, -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456, +#define precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+128) + -3593, 1701, 2194, 834, -3625, 2319, -1100, 121, -3593, 1701, 2194, 834, -3625, 2319, -1100, 121, +#define precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+144) + -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250, -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250, +#define precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+160) + -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414, -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414, +#define precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+176) + 3777, -121, 1100, -2319, 3625, -834, -2194, -1701, 3777, -121, 1100, -2319, 3625, -834, -2194, -1701, +#define precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+192) + -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816, -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816, +#define precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+208) + -3625, 617, 2319, 2006, -1100, -1296, 121, 1986, -3625, 617, 2319, 2006, -1100, -1296, 121, 1986, +#define precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+224) + -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921, -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921, +#define precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+240) + -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830, -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830, +#define precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+256) + -3593, 514, 3364, 438, 1701, 2555, -1599, -1738, 2194, 103, 2557, 1881, 834, -549, -2816, 638, +#define precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+272) + -3625, -1399, 617, -1760, 2319, 2535, 2006, 3266, -1100, -1431, -1296, 3174, 121, 3153, 1986, -810, +#define precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+288) + -3777, 2956, -2830, -679, 1414, 2440, -1993, -3689, 2456, 2804, 1525, 3555, 2495, 1535, -2088, -7, +#define precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+304) + 3182, -1321, -1921, -1305, 2876, -3772, -3706, 3600, -3696, -2043, 1483, -396, 2250, -2310, -2237, 1887, +#define precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+320) + -3593, -1887, 2237, 2310, -2250, 396, -1483, 2043, 3696, -3600, 3706, 3772, -2876, 1305, 1921, 1321, +#define precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+336) + -3182, 7, 2088, -1535, -2495, -3555, -1525, -2804, -2456, 3689, 1993, -2440, -1414, 679, 2830, -2956, +#define precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+352) + 3777, 810, -1986, -3153, -121, -3174, 1296, 1431, 1100, -3266, -2006, -2535, -2319, 1760, -617, 1399, +#define precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+368) + 3625, -638, 2816, 549, -834, -1881, -2557, -103, -2194, 1738, 1599, -2555, -1701, -438, -3364, -514, +#define 
precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+384) + -3593, -1532, 514, -373, 3364, -3816, 438, -3456, 1701, 783, 2555, 2883, -1599, 727, -1738, -2385, +#define precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+400) + 2194, -2160, 103, -2391, 2557, 2762, 1881, -2426, 834, 3310, -549, -1350, -2816, 1386, 638, -194, +#define precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+416) + -3625, 404, -1399, -3692, 617, -2764, -1760, -1054, 2319, 1799, 2535, -3588, 2006, 1533, 3266, 2113, +#define precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+432) + -1100, -2579, -1431, -1756, -1296, 1598, 3174, -2, 121, -3480, 3153, -2572, 1986, 2743, -810, 2919, +#define precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+448) + -3593, 2789, -1887, -921, 2237, -1497, 2310, -2133, -2250, -915, 396, 1390, -1483, 3135, 2043, -859, +#define precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+464) + 3696, 2732, -3600, -1464, 3706, 2224, 3772, -2665, -2876, 1698, 1305, 2835, 1921, 730, 1321, 486, +#define precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+480) + -3182, 3417, 7, -3428, 2088, -3145, -1535, 1168, -2495, -3831, -3555, -3750, -1525, 660, -2804, 2649, +#define precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+496) + -2456, 3405, 3689, -1521, 1993, 1681, -2440, 1056, -1414, 1166, 679, -2233, 2830, 2175, -2956, -1919, +#define precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+512) + -3593, -1404, -1532, 451, 514, -402, -373, 1278, 3364, -509, -3816, -3770, 438, -2345, -3456, -226, +#define precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+528) + 1701, -1689, 783, -1509, 2555, 2963, 2883, 1242, -1599, 1669, 727, 2719, -1738, 642, -2385, -436, +#define precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+544) + 2194, 3335, -2160, 1779, 103, 3745, -2391, 17, 2557, 2812, 2762, -1144, 1881, 83, -2426, -1181, +#define precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+560) + 834, -1519, 3310, 3568, -549, -796, -1350, 2072, -2816, -2460, 1386, 2891, 638, -2083, -194, -715, +#define precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+576) + -3593, -402, -3816, -226, 2555, 1669, -2385, 1779, 2557, 83, 3310, 2072, 638, 1012, -3692, 1295, +#define precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+592) + 2319, -3208, 1533, -2071, -1431, -2005, -2, 1586, 1986, -293, 1919, -929, -679, 777, -1681, -3461, +#define precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+608) + 2456, 3366, 3750, -1203, 1535, -3657, -3417, -1712, -1921, 2515, 2665, -1070, 3600, 2532, -3135, -2589, +#define precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+624) + 2250, -2258, 921, -658, -514, 509, 3456, 1509, 1599, -642, 2160, -17, -1881, 1519, 1350, -2891, +#define precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+640) + -3593, -3434, -1497, 893, 396, -2422, -859, 2965, 3706, -2339, 1698, -2937, 1321, -670, -3428, -3163, +#define precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+656) + -2495, -1072, 660, 1084, 3689, -179, 
1056, -1338, 2830, 2786, -2919, -3677, -3153, -151, -1598, 3334, +#define precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+672) + 1100, -3314, 3588, 2262, 1760, -2230, -404, 2083, 2816, -3568, 2426, -2812, -103, 436, -727, -2963, +#define precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+688) + -1701, 3770, 373, 1404, 1887, -1649, 2133, -826, 1483, 434, -2732, 3287, -3772, -2378, -2835, 3723, +#define precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+704) + -3593, 658, 2789, 370, -1887, -3434, -921, -3752, 2237, 1649, -1497, 2258, 2310, 3581, -2133, 893, +#define precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+720) + -2250, 3794, -915, 826, 396, 2589, 1390, 592, -1483, -2422, 3135, 3214, 2043, -434, -859, -2532, +#define precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+736) + 3696, 1121, 2732, 2965, -3600, 2998, -1464, -3287, 3706, 1070, 2224, -589, 3772, -2339, -2665, 2070, +#define precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+752) + -2876, 2378, 1698, -2515, 1305, -2815, 2835, -2937, 1921, -1348, 730, -3723, 1321, 1712, 486, 2130, +#define q_x16 *(const int16x16 *)(qdata+768) + 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, +#define qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+784) + -9, -9, -9, -9, -16425, -16425, -16425, -16425, -9, -9, -9, -9, -16425, -16425, -16425, -16425, +#define qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+800) + -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350, -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350, +#define qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+816) + -9, -9, -9, -9, -10350, -10350, -10350, -10350, -9, -9, -9, -9, -10350, -10350, -10350, -10350, +#define qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+832) + 28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425, 28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425, +#define qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+848) + -9, -9, -9, -9, -4974, -4974, -4974, -4974, -9, -9, -9, -9, -4974, -4974, -4974, -4974, +#define qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+864) + -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244, -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244, +#define qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+880) + -9, -9, -9, -9, -4496, -4496, -4496, -4496, -9, -9, -9, -9, -4496, -4496, -4496, -4496, +#define qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+896) + -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744, -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744, +#define qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+912) + -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655, -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655, +#define qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+928) + -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754, -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754, +#define qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+944) + -9, -23754, -4496, -828, -10350, 22593, 
-14744, -20870, -9, -23754, -4496, -828, -10350, 22593, -14744, -20870, +#define qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+960) + 28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315, 28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315, +#define qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+976) + -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816, -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816, +#define qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+992) + -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394, -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394, +#define qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+1008) + -9, -7491, -23754, -15307, -4496, -15750, -828, -5759, -9, -7491, -23754, -15307, -4496, -15750, -828, -5759, +#define qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1024) + -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382, -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382, +#define qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1040) + -9, -14846, -10972, -21066, -20315, -24581, 23489, -23242, -4974, -4505, 25597, -26279, 18242, 21467, -2816, 15998, +#define qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1056) + -16425, -4983, -19351, 14624, 18191, -2073, -3114, 20674, -7244, -21399, -9488, 6246, -11655, -29103, 19394, -5930, +#define qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1072) + -28865, -23668, -26382, -28839, 20870, 6536, -31177, 16279, 14744, 29428, 20469, 29667, -22593, 9215, -22568, -11783, +#define qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1088) + 10350, -14121, 5759, -5913, 828, -1724, 15750, 11792, 4496, 25093, 15307, 26228, 23754, -21766, 7491, -6817, +#define qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1104) + -9, 6817, -7491, 21766, -23754, -26228, -15307, -25093, -4496, -11792, -15750, 1724, -828, 5913, -5759, 14121, +#define qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1120) + -10350, 11783, 22568, -9215, 22593, -29667, -20469, -29428, -14744, -16279, 31177, -6536, -20870, 28839, 26382, 23668, +#define qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1136) + 28865, 5930, -19394, 29103, 11655, -6246, 9488, 21399, 7244, -20674, 3114, 2073, -18191, -14624, 19351, 4983, +#define qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1152) + 16425, -15998, 2816, -21467, -18242, 26279, -25597, 4505, 4974, 23242, -23489, 24581, 20315, 21066, 10972, 14846, +#define qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1168) + -9, -32252, -14846, -19317, -10972, 8472, -21066, -3456, -20315, 16655, -24581, 12611, 23489, -12073, -23242, 29871, +#define qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1184) + -4974, 6032, -4505, 10409, 25597, 24266, -26279, 17030, 18242, 10478, 21467, 11962, -2816, -26262, 15998, -17602, +#define qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1200) + -16425, -22124, -4983, -26220, -19351, -8908, 14624, 32738, 18191, 13575, -2073, 27132, -3114, 
24573, 20674, 27201, +#define qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1216) + -7244, 12269, -21399, -16092, -9488, -15810, 6246, 15358, -11655, -15768, -29103, 24052, 19394, -26441, -5930, -1689, +#define qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1232) + -9, 13541, 6817, -5529, -7491, 26663, 21766, -4693, -23754, 13933, -26228, 8558, -15307, -21953, -25093, -22875, +#define qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1248) + -4496, -7508, -11792, -30136, -15750, 26800, 1724, 17303, -828, 2722, 5913, -12013, -5759, 30426, 14121, 3558, +#define qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1264) + -10350, -24743, 11783, -21860, 22568, -32329, -9215, 9360, 22593, -7415, -29667, 25946, -20469, -21868, -29428, -25511, +#define qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1280) + -14744, 1869, -16279, 14351, 31177, 2193, -6536, 17440, -20870, 24718, 28839, -23225, 26382, 9855, 23668, -9599, +#define qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1296) + -9, -32124, -32252, 10179, -14846, 6766, -19317, 16638, -10972, -23549, 8472, -17082, -21066, -15145, -3456, 31518, +#define qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1312) + -20315, -6297, 16655, -12261, -24581, -11885, 12611, 30938, 23489, 28805, -12073, 26783, -23242, -14718, 29871, 5708, +#define qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1328) + -4974, 15111, 6032, -29453, -4505, 12449, 10409, 529, 25597, -32004, 24266, 2952, -26279, 18003, 17030, 24931, +#define qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1344) + 18242, -1007, 10478, -4624, 21467, 17636, 11962, 14360, -2816, 15972, -26262, 16715, 15998, 4573, -17602, -14539, +#define qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1360) + -9, 6766, 8472, 31518, -24581, 28805, 29871, -29453, 25597, 18003, 10478, 14360, 15998, 27636, -26220, 17167, +#define qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1376) + 18191, -7304, 24573, -22039, -21399, -4565, 15358, 10802, 19394, 21723, 9599, -9633, -28839, -2807, -2193, -30597, +#define qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1392) + 14744, -26330, -25946, -2739, 9215, 32695, 24743, -26288, 5759, 20435, -17303, 24530, 11792, 20964, 21953, 23523, +#define qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1408) + 23754, -27858, 5529, 6510, 14846, 23549, 3456, 12261, -23489, 14718, -6032, -529, 26279, 1007, -11962, -16715, +#define qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1424) + -9, 24214, 26663, 23933, -26228, -13686, -22875, -27243, -15750, 4317, 2722, 8839, 14121, -32414, -21860, -25179, +#define qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1440) + 22593, -25648, -21868, -964, -16279, -1715, 17440, -14650, 26382, -28958, 1689, -10333, 29103, -20119, 15810, 22790, +#define qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1456) + 7244, 20238, -27132, -2858, -14624, 19274, 22124, -4573, 2816, 4624, -17030, 32004, 4505, -5708, 
12073, 11885, +#define qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1472) + 20315, 17082, 19317, 32124, -6817, 14223, 4693, -14138, 15307, 9650, 7508, -9513, -1724, -23882, 12013, -15221, +#define qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1488) + -9, -6510, 13541, -23182, 6817, 24214, -5529, -24232, -7491, -14223, 26663, 27858, 21766, 26621, -4693, 23933, +#define qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1504) + -23754, 29394, 13933, 14138, -26228, -23523, 8558, -23984, -15307, -13686, -21953, 26766, -25093, -9650, -22875, -20964, +#define qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1520) + -4496, -22943, -7508, -27243, -11792, -18506, -30136, 9513, -15750, -24530, 26800, 947, 1724, 4317, 17303, 29718, +#define qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1536) + -828, 23882, 2722, -20435, 5913, -10495, -12013, 8839, -5759, -3396, 30426, 15221, 14121, 26288, 3558, 27730, +#define qinvscaledzeta_x16_4_1 *(const int16x16 *)(qdata+1552) + -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, +#define qinvscaledzeta_x16_4_3 *(const int16x16 *)(qdata+1568) + 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, +#define qinvscaledzeta_x16_8_1 *(const int16x16 *)(qdata+1584) + -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, +#define qinvscaledzeta_x16_8_7 *(const int16x16 *)(qdata+1600) + -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, +#define qround32_x16 *(const int16x16 *)(qdata+1616) + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +#define scaledzeta_x16_4_1 *(const int16x16 *)(qdata+1632) + -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, +#define scaledzeta_x16_4_3 *(const int16x16 *)(qdata+1648) + 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, +#define scaledzeta_x16_8_1 *(const int16x16 *)(qdata+1664) + -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, +#define scaledzeta_x16_8_7 *(const int16x16 *)(qdata+1680) + -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, + } +} ; + +static const vec1696 qdata_10753 = { .data = { + // precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 + 1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688, 1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688, + // precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 + -223, -223, -223, -223, -4188, -4188, -4188, -4188, -223, -223, -223, -223, -4188, -4188, -4188, -4188, + // precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 + 1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188, 1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188, + // precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 + 223, 223, 223, 223, -3688, -3688, -3688, -3688, 223, 223, 223, 223, -3688, -3688, -3688, -3688, + // precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 + 1018, 1018, 1018, 1018, -376, -376, -376, -376, 1018, 1018, 1018, 1018, -376, -376, -376, -376, + // precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 + 3688, 3688, 3688, 3688, 
-3686, -3686, -3686, -3686, 3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686, + // precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 + 1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413, 1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413, + // precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 + 4188, 4188, 4188, 4188, -357, -357, -357, -357, 4188, 4188, 4188, 4188, -357, -357, -357, -357, + // precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 + 1018, -3364, -376, 4855, 3688, 425, -3686, 2695, 1018, -3364, -376, 4855, 3688, 425, -3686, 2695, + // precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 + -223, -3784, 357, -2236, -4188, 4544, 2413, 730, -223, -3784, 357, -2236, -4188, 4544, 2413, 730, + // precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 + 1018, -730, -2413, -4544, 4188, 2236, -357, 3784, 1018, -730, -2413, -4544, 4188, 2236, -357, 3784, + // precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 + 223, -2695, 3686, -425, -3688, -4855, 376, 3364, 223, -2695, 3686, -425, -3688, -4855, 376, 3364, + // precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 + 1018, -5175, -3364, 2503, -376, 1341, 4855, -4875, 1018, -5175, -3364, 2503, -376, 1341, 4855, -4875, + // precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 + 3688, -2629, 425, -4347, -3686, 3823, 2695, -4035, 3688, -2629, 425, -4347, -3686, 3823, 2695, -4035, + // precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 + 1018, 5063, -730, 341, -2413, -3012, -4544, -5213, 1018, 5063, -730, 341, -2413, -3012, -4544, -5213, + // precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 + 4188, 1520, 2236, 1931, -357, 918, 3784, 4095, 4188, 1520, 2236, 1931, -357, 918, 3784, 4095, + // precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + 1018, 3085, -5175, 2982, -3364, -4744, 2503, -4129, -376, -2576, 1341, -193, 4855, 3062, -4875, 4, + // precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + 3688, 2388, -2629, -4513, 425, 4742, -4347, 2935, -3686, -544, 3823, -2178, 2695, 847, -4035, 268, + // precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + -223, -1299, -4095, -1287, -3784, -4876, -918, 3091, 357, -4189, -1931, 4616, -2236, 2984, -1520, -3550, + // precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + -4188, -1009, 5213, -205, 4544, -4102, 3012, 2790, 2413, -1085, -341, -2565, 730, -4379, -5063, -1284, + // precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + 1018, 1284, 5063, 4379, -730, 2565, 341, 1085, -2413, -2790, -3012, 4102, -4544, 205, -5213, 1009, + // precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + 4188, 3550, 1520, -2984, 2236, -4616, 1931, 4189, -357, -3091, 918, 4876, 3784, 1287, 4095, 1299, + // precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + 223, -268, 4035, -847, -2695, 2178, -3823, 544, 3686, -2935, 4347, -4742, -425, 4513, 2629, -2388, + // precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + -3688, -4, 4875, -3062, -4855, 193, -1341, 2576, 376, 4129, -2503, 4744, 3364, -2982, 5175, -3085, + // precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + 1018, 5116, 3085, -3615, -5175, 400, 2982, 3198, -3364, 2234, -4744, -4828, 2503, 326, -4129, -512, + // precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -376, 1068, -2576, -4580, 1341, 3169, -193, -2998, 4855, -635, 3062, -4808, -4875, -2740, 4, 675, + // precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + 3688, -1324, 2388, 5114, -2629, 5294, -4513, -794, 425, -864, 4742, -886, -4347, 336, 2935, -2045, + // precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 
-3686, -3715, -544, 4977, 3823, -2737, -2178, 3441, 2695, 467, 847, 454, -4035, -779, 268, 2213, + // precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + 1018, 1615, 1284, 2206, 5063, 5064, 4379, 472, -730, -5341, 2565, -4286, 341, 2981, 1085, -1268, + // precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -2413, -3057, -2790, -2884, -3012, -1356, 4102, -3337, -4544, 5023, 205, -636, -5213, 909, 1009, -2973, + // precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + 4188, 2271, 3550, -1572, 1520, 1841, -2984, 970, 2236, -4734, -4616, 578, 1931, -116, 4189, 1586, + // precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + -357, -2774, -3091, -1006, 918, -5156, 4876, 4123, 3784, -567, 1287, 151, 4095, 1458, 1299, 2684, + // precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + 1018, -3260, 5116, -1722, 3085, 5120, -3615, 3760, -5175, 73, 400, 4254, 2982, 2788, 3198, -2657, + // precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -3364, 569, 2234, 1930, -4744, -2279, -4828, 5215, 2503, -4403, 326, 1639, -4129, 5068, -512, -5015, + // precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + -376, -4859, 1068, -40, -2576, 4003, -4580, -4621, 1341, 2487, 3169, -2374, -193, 2625, -2998, 4784, + // precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 4855, 825, -635, 2118, 3062, -2813, -4808, -4250, -4875, -2113, -2740, -4408, 4, -1893, 675, 458, + // precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + 1018, 5120, 400, -2657, -4744, -4403, -512, -40, 1341, 2625, -635, -4250, 4, -3360, 5114, -5313, + // precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + 425, -2151, 336, -2662, -544, 5334, 3441, 2117, -4035, 2205, -2684, -3570, -1287, -4973, 5156, 2419, + // precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + 357, 1204, -578, 1635, 2984, -1111, -2271, 4359, 5213, -2449, 3337, 3453, 2790, 554, -2981, -1409, + // precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 730, -279, -2206, 3524, -3085, -73, -3198, -1930, -2503, -5068, -1068, 4621, 193, -825, 4808, 4408, + // precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + 1018, 4428, 5064, -4000, 2565, 573, -1268, 3125, -3012, -4144, 5023, 1927, 1009, -2139, -1572, 3535, + // precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + 2236, 663, -116, 4967, -3091, -854, 4123, 1160, 4095, -1349, -2213, 1782, -847, 2062, 2737, 624, + // precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + 3686, -2283, 886, 4889, 4513, -4601, 1324, 1893, 4875, -2118, 2998, -2487, 2576, 5015, -326, 2279, + // precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 3364, -4254, 3615, 3260, -1284, -1381, -472, -3891, -341, 2087, 3057, 4720, -4102, 3410, 636, 1689, + // precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + 1018, -3524, 1615, 5268, 1284, 4428, 2206, -834, 5063, 1381, 5064, 279, 4379, 2439, 472, -4000, + // precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -730, -2015, -5341, 3891, 2565, 1409, -4286, 2605, 341, 573, 2981, 5356, 1085, -2087, -1268, -554, + // precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + -2413, 3135, -3057, 3125, -2790, -778, -2884, -4720, -3012, -3453, -1356, -355, 4102, -4144, -3337, -152, + // precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + -4544, -3410, 5023, 2449, 205, -97, -636, 1927, -5213, 2624, 909, -1689, 1009, -4359, -2973, -3419, + // q_x16 + 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 
10753, 10753, + // qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 + -6, -6, -6, -6, -408, -408, -408, -408, -6, -6, -6, -6, -408, -408, -408, -408, + // qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 + -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956, -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956, + // qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 + -6, -6, -6, -6, -1956, -1956, -1956, -1956, -6, -6, -6, -6, -1956, -1956, -1956, -1956, + // qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 + 27359, 27359, 27359, 27359, 408, 408, 408, 408, 27359, 27359, 27359, 27359, 408, 408, 408, 408, + // qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 + -6, -6, -6, -6, -20856, -20856, -20856, -20856, -6, -6, -6, -6, -20856, -20856, -20856, -20856, + // qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 + -408, -408, -408, -408, -21094, -21094, -21094, -21094, -408, -408, -408, -408, -21094, -21094, -21094, -21094, + // qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 + -6, -6, -6, -6, -10093, -10093, -10093, -10093, -6, -6, -6, -6, -10093, -10093, -10093, -10093, + // qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 + -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517, -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517, + // qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 + -6, -9508, -20856, -29449, -408, 18345, -21094, -7033, -6, -9508, -20856, -29449, -408, 18345, -21094, -7033, + // qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 + -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090, -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090, + // qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 + -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072, -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072, + // qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 + 27359, 7033, 21094, -18345, 408, 29449, 20856, 9508, 27359, 7033, 21094, -18345, 408, 29449, 20856, 9508, + // qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 + -6, -3639, -9508, 25543, -20856, 829, -29449, -17675, -6, -3639, -9508, 25543, -20856, 829, -29449, -17675, + // qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 + -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547, -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547, + // qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 + -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683, -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683, + // qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 + -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847, -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847, + // qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + -6, -5619, -3639, -12378, -9508, 15736, 25543, 23007, -20856, -27152, 829, -22209, -29449, -20490, -17675, 22532, + // qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -408, 16724, 18363, 22623, 18345, 5766, 7429, -31369, -21094, 15840, -10001, 19326, -7033, 3407, -4547, 2316, + // qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + -27359, 6381, -14847, 8441, -16072, -6924, -26518, -4589, 28517, 12707, -14731, -15864, -12476, 31656, 23056, 24098, + // qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 1956, -31217, -11683, -24269, -28224, -5126, -7228, 20198, 10093, -573, -3925, -14341, 16090, 23781, -28103, -23812, + // qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + -6, 23812, 28103, -23781, -16090, 14341, 3925, 573, -10093, -20198, 7228, 5126, 28224, 24269, 11683, 
31217, + // qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -1956, -24098, -23056, -31656, 12476, 15864, 14731, -12707, -28517, 4589, 26518, 6924, 16072, -8441, 14847, -6381, + // qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + 27359, -2316, 4547, -3407, 7033, -19326, 10001, -15840, 21094, 31369, -7429, -5766, -18345, -22623, -18363, -16724, + // qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 408, -22532, 17675, 20490, 29449, 22209, -829, 27152, 20856, -23007, -25543, -15736, 9508, 12378, 3639, 5619, + // qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + -6, -17412, -5619, 2017, -3639, 24976, -12378, 24702, -9508, -31558, 15736, 1316, 25543, -31418, 23007, -512, + // qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -20856, -13268, -27152, 22044, 829, 8801, -22209, -12214, -29449, 11141, -20490, -17096, -17675, 32076, 22532, 17571, + // qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + -408, 13012, 16724, 4090, 18363, -30546, 22623, 16614, 18345, -17248, 5766, 22666, 7429, -7856, -31369, 31235, + // qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + -21094, 28541, 15840, -30351, -10001, -177, 19326, -31887, -7033, 25555, 3407, -31290, -4547, -13579, 2316, -2395, + // qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + -6, 4175, 23812, 7326, 28103, 17352, -23781, -28200, -16090, 11555, 14341, 6978, 3925, -1627, 573, 780, + // qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -10093, 32271, -20198, 7356, 7228, 29364, 5126, 27895, 28224, -609, 24269, 21892, 11683, -7795, 31217, -18845, + // qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + -1956, 29407, -24098, -7716, -23056, -719, -31656, -8246, 12476, -26238, 15864, 11842, 14731, 1932, -12707, -11726, + // qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + -28517, 4394, 4589, 2066, 26518, -11300, 6924, -24037, 16072, 969, -8441, 14999, 14847, -11854, -6381, -19844, + // qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + -6, -13500, -17412, 32070, -5619, 5120, 2017, 11952, -3639, 1609, 24976, 9374, -12378, -23836, 24702, -8289, + // qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -9508, -22471, -31558, 25482, 15736, -8935, 1316, 32351, 25543, 19661, -31418, 8295, 23007, -25652, -512, -19863, + // qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + -20856, 6917, -13268, -28712, -27152, 20899, 22044, 4083, 829, 951, 8801, 29370, -22209, 24641, -12214, 12976, + // qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + -29449, -22215, 11141, -29626, -20490, 30467, -17096, 13158, -17675, -24129, 32076, 7880, 22532, -30053, 17571, -8758, + // qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + -6, 5120, 24976, -8289, 15736, 19661, -512, -28712, 829, 24641, 11141, 13158, 22532, 13024, 4090, -27329, + // qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + 18345, -8807, -7856, -20070, 15840, -1834, -31887, -18875, -4547, 18077, 19844, -23026, 8441, -12653, 11300, 11123, + // qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + 28517, 31924, -11842, -14237, 31656, 16809, -29407, -5369, -11683, -16273, -27895, -29827, 20198, 7722, 1627, 9343, + // qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 16090, -15127, -7326, -6716, 5619, -1609, -24702, -25482, -25543, 25652, 13268, -4083, 22209, 22215, 17096, -7880, + // 
qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + -6, -26292, 17352, 12384, 14341, 61, 780, 23093, 7228, -12336, -609, -7801, 31217, -6747, -7716, 6095, + // qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + 12476, 15511, 1932, 11623, 4589, 6314, -24037, -19320, 14847, 19643, 2395, -21770, -3407, -17394, 177, -23952, + // qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + 21094, -31467, -22666, -1767, -22623, -14329, -13012, 30053, 17675, 29626, 12214, -951, 27152, 19863, 31418, 8935, + // qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 9508, -9374, -2017, 13500, -23812, -29541, 28200, 20173, -3925, -24025, -32271, -19856, -5126, -26286, -21892, -4967, + // qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 + -6, 6716, 4175, -13164, 23812, -26292, 7326, -12098, 28103, 29541, 17352, 15127, -23781, -7289, -28200, 12384, + // qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 + -16090, -29151, 11555, -20173, 14341, -9343, 6978, -22483, 3925, 61, -1627, 23788, 573, 24025, 780, -7722, + // qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 + -10093, -18881, 32271, 23093, -20198, -24330, 7356, 19856, 7228, 29827, 29364, 15517, 5126, -12336, 27895, -4248, + // qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 + 28224, 26286, -609, 16273, 24269, -5729, 21892, -7801, 11683, -30144, -7795, 4967, 31217, 5369, -18845, -8027, + // qinvscaledzeta_x16_4_1 + -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, + // qinvscaledzeta_x16_4_3 + 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, + // qinvscaledzeta_x16_8_1 + -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, + // qinvscaledzeta_x16_8_7 + -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, + // qround32_x16 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + // scaledzeta_x16_4_1 + -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, + // scaledzeta_x16_4_3 + 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, + // scaledzeta_x16_8_1 + 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, + // scaledzeta_x16_8_7 + 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, + } +} ; + +static inline int16x16 add_x16(int16x16 a, int16x16 b) { + return _mm256_add_epi16(a, b); +} + +static inline int16x16 sub_x16(int16x16 a, int16x16 b) { + return _mm256_sub_epi16(a, b); +} + +static inline int16x16 mulmod_scaled_x16(int16x16 x, int16x16 y, int16x16 yqinv, const int16 *qdata) { + int16x16 b = _mm256_mulhi_epi16(x, y); + int16x16 d = _mm256_mullo_epi16(x, yqinv); + int16x16 e = _mm256_mulhi_epi16(d, q_x16); + return sub_x16(b, e); +} + +static inline int16x16 reduce_x16(int16x16 x, const int16 *qdata) { + int16x16 y = _mm256_mulhrs_epi16(x, qround32_x16); + y = _mm256_mullo_epi16(y, q_x16); + return sub_x16(x, y); +} + +// ----- codegen pass 1 +// +// startntt 512 +// startbatch 512 +// // ----- PRECONDITIONS +// physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// // transform size 512 +// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] +// // transforms per batch 1 +// // batch indexing [] +// // total batch size 512 +// +// // modulus 
x^512-1 pos 0:512 q 7681,10753 bound 512*(5629,5800) +// assertranges ... +// +// // ----- LAYER 1 +// +// // butterfly(0,256,1,256,1,0) +// butterfly 0 256 1 256 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // ----- POSTCONDITIONS AFTER LAYER 1 +// // transform size 512 +// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] +// // transforms per batch 1 +// // batch indexing [] +// // total batch size 512 +// +// // modulus x^256-1 pos 0:256 q 7681,10753 bound 256*(11258,11600) +// assertranges ... +// +// // modulus x^256+1 pos 256:512 q 7681,10753 bound 256*(11258,11600) +// assertranges ... +// +// // ----- LAYER 2 +// +// // reduce_ifreverse(0,64,1) +// reduce_ifreverse 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // reduce_ifreverse(256,320,1) +// reduce_ifreverse 256 320 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // butterfly(0,128,1,128,1,0) +// butterfly 0 128 1 128 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // butterfly(256,384,1,128,4,1) +// butterfly 256 384 1 128 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // ----- POSTCONDITIONS AFTER LAYER 2 +// // transform size 512 +// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] +// // transforms per batch 1 +// // batch indexing [] +// // total batch size 512 +// +// // modulus x^128-1 pos 0:128 q 7681,10753 bound 128*(22516,23200) +// assertranges ... +// +// // modulus x^128+1 pos 128:256 q 7681,10753 bound 128*(22516,23200) +// assertranges ... +// +// // modulus x^128-zeta4 pos 256:384 q 7681,10753 bound 128*(15747,17016) +// assertranges ... +// +// // modulus x^128+zeta4 pos 384:512 q 7681,10753 bound 128*(15747,17016) +// assertranges ... +// +// // ----- LAYER 3 +// +// // reduce_ifforward(64,128,1) +// reduce_ifforward 64 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // butterfly(0,64,1,64,1,0) +// butterfly 0 64 1 64 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // butterfly(128,192,1,64,4,1) +// butterfly 128 192 1 64 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // butterfly(256,320,1,64,8,1) +// butterfly 256 320 1 64 8 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // butterfly(384,448,1,64,8,-1) +// butterfly 384 448 1 64 8 7 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // reduce(0,64,1) +// reduce 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // twist(64,128,1,128,1) +// twist 64 128 1 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // twist(128,192,1,256,1) +// twist 128 192 1 256 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // twist(192,256,1,256,-1) +// twist 192 256 1 256 255 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // twist(256,320,1,512,1) +// twist 256 320 1 512 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // twist(320,384,1,512,5) +// twist 320 384 1 512 5 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // twist(384,448,1,512,-1) +// twist 384 448 1 512 511 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // twist(448,512,1,512,-5) +// twist 448 512 1 512 507 (0, 1, 2, 3, 4, 5, 6, 7, 8) () +// +// // physical_permute(3,6) +// physical_permute (3, 6) (0, 1, 2, 3, 4, 5, 6, 7, 8) () (0, 1, 2, 6, 4, 5, 3, 7, 8) () +// +// // fold(256) +// physical_unmap (0, 1, 2, 6, 4, 5, 3, 7, 8) () +// physical_map (0, 1, 2, 6, 4, 5, 3, 7) (8,) +// +// // fold(128) +// physical_unmap (0, 1, 2, 6, 4, 5, 3, 7) (8,) +// physical_map (0, 1, 2, 6, 4, 5, 3) (7, 8) +// +// // fold(64) +// physical_unmap (0, 1, 2, 6, 4, 5, 3) (7, 8) +// physical_map (0, 1, 2, 6, 4, 5) (3, 7, 8) +// +// // nextbatch() +// stopbatch 512 +// startbatch 512 +// +// // halfbatch() +// physical_unmap (0, 1, 2, 6, 4, 5) (3, 7, 8) +// stopbatch 512 +// doublereps +// startbatch 256 +// physical_map (0, 1, 2, 6, 4, 
5) (3, 7) +// +// // halfbatch() +// physical_unmap (0, 1, 2, 6, 4, 5) (3, 7) +// stopbatch 256 +// doublereps +// startbatch 128 +// physical_map (0, 1, 2, 6, 4, 5) (3,) +// +// // ----- POSTCONDITIONS AFTER LAYER 3 +// // transform size 64 +// // transform indexing [0, 1, 2, 6, 4, 5] +// // transforms per batch 2 +// // batch indexing [3] +// // total batch size 128 +// +// // modulus x^64-1 pos 0:64 q 7681,10753 bound 1*(5629,5827) 1*(5629,7613) 1*(5629,7666) 1*(5629,7264) 1*(5629,7639) 1*(5629,7591) 1*(5629,7291) 1*(5629,7204) ... +// assertranges ... +// +// // ----- LAYER 4 +// +// // butterfly(0,32,1,32,1,0) +// butterfly 0 32 1 32 1 0 (0, 1, 2, 6, 4, 5) (3,) +// +// // ----- POSTCONDITIONS AFTER LAYER 4 +// // transform size 64 +// // transform indexing [0, 1, 2, 6, 4, 5] +// // transforms per batch 2 +// // batch indexing [3] +// // total batch size 128 +// +// // modulus x^32-1 pos 0:32 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ... +// assertranges ... +// +// // modulus x^32+1 pos 32:64 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ... +// assertranges ... +// +// // ----- LAYER 5 +// +// // butterfly(0,16,1,16,1,0) +// butterfly 0 16 1 16 1 0 (0, 1, 2, 6, 4, 5) (3,) +// +// // butterfly(32,48,1,16,4,1) +// butterfly 32 48 1 16 4 1 (0, 1, 2, 6, 4, 5) (3,) +// +// // reduce(0,16,1) +// reduce 0 16 1 (0, 1, 2, 6, 4, 5) (3,) +// +// // twist(16,32,1,32,1) +// twist 16 32 1 32 1 (0, 1, 2, 6, 4, 5) (3,) +// +// // twist(32,48,1,64,1) +// twist 32 48 1 64 1 (0, 1, 2, 6, 4, 5) (3,) +// +// // twist(48,64,1,64,-1) +// twist 48 64 1 64 63 (0, 1, 2, 6, 4, 5) (3,) +// +// // physical_permute(0,1,2,5) +// physical_permute (0, 1, 2, 5) (0, 1, 2, 6, 4, 5) (3,) (1, 2, 5, 6, 4, 0) (3,) +// +// // fold(32) +// physical_unmap (1, 2, 5, 6, 4, 0) (3,) +// physical_map (1, 2, 5, 6, 4) (0, 3) +// +// // fold(16) +// physical_unmap (1, 2, 5, 6, 4) (0, 3) +// physical_map (1, 2, 5, 6) (0, 3, 4) +// +// // ----- POSTCONDITIONS AFTER LAYER 5 +// // transform size 16 +// // transform indexing [1, 2, 5, 6] +// // transforms per batch 8 +// // batch indexing [0, 3, 4] +// // total batch size 128 +// +// // modulus x^16-1 pos 0:16 q 7681,10753 bound 1*(5629,5800) 1*(5629,6967) 1*(5629,6418) 1*(5629,7585) 1*(5629,7017) 1*(5629,6328) 1*(5629,7033) 1*(5629,6943) ... +// assertranges ... +// +// // ----- LAYER 6 +// +// // butterfly(0,8,1,8,1,0) +// butterfly 0 8 1 8 1 0 (1, 2, 5, 6) (0, 3, 4) +// +// // physical_permute(1,2,4) +// physical_permute (1, 2, 4) (1, 2, 5, 6) (0, 3, 4) (2, 4, 5, 6) (0, 3, 1) +// +// // nextbatch() +// stopbatch 128 +// startbatch 128 +// +// // ----- POSTCONDITIONS AFTER LAYER 6 +// // transform size 16 +// // transform indexing [2, 4, 5, 6] +// // transforms per batch 8 +// // batch indexing [0, 3, 1] +// // total batch size 128 +// +// // modulus x^8-1 pos 0:8 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555) +// assertranges ... +// +// // modulus x^8+1 pos 8:16 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555) +// assertranges ... 
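+//
+// // (expository note, not generator output) The "bound" pairs above track
+// // worst-case coefficient magnitudes for q=7681 and q=10753 respectively.
+// // An add/sub butterfly adds the bounds of its two inputs, which is why the
+// // 5629 figures in the layer-5 postconditions become 11258 after the
+// // layer-6 butterfly; the interleaved reduce/twist steps then pull the
+// // values back down so that every asserted range stays inside int16.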
+// +// // ----- LAYER 7 +// +// // butterfly(0,4,1,4,1,0) +// butterfly 0 4 1 4 1 0 (2, 4, 5, 6) (0, 3, 1) +// +// // butterfly(8,12,1,4,4,1) +// butterfly 8 12 1 4 4 1 (2, 4, 5, 6) (0, 3, 1) +// +// // reduce(0,4,1) +// reduce 0 4 1 (2, 4, 5, 6) (0, 3, 1) +// +// // twist(4,8,1,8,1) +// twist 4 8 1 8 1 (2, 4, 5, 6) (0, 3, 1) +// +// // twist(8,12,1,16,1) +// twist 8 12 1 16 1 (2, 4, 5, 6) (0, 3, 1) +// +// // twist(12,16,1,16,-1) +// twist 12 16 1 16 15 (2, 4, 5, 6) (0, 3, 1) +// +// // physical_permute(2,6) +// physical_permute (2, 6) (2, 4, 5, 6) (0, 3, 1) (6, 4, 5, 2) (0, 3, 1) +// +// // fold(8) +// physical_unmap (6, 4, 5, 2) (0, 3, 1) +// physical_map (6, 4, 5) (0, 1, 2, 3) +// +// // fold(4) +// physical_unmap (6, 4, 5) (0, 1, 2, 3) +// physical_map (6, 4) (0, 1, 2, 3, 5) +// +// // ----- POSTCONDITIONS AFTER LAYER 7 +// // transform size 4 +// // transform indexing [6, 4] +// // transforms per batch 32 +// // batch indexing [0, 1, 2, 3, 5] +// // total batch size 128 +// +// // modulus x^4-1 pos 0:4 q 7681,10753 bound 1*(5629,5800) 1*(5629,6938) 1*(5629,6521) 1*(5629,7157) +// assertranges ... +// +// // ----- LAYER 8 +// +// // butterfly(0,2,1,2,1,0) +// butterfly 0 2 1 2 1 0 (6, 4) (0, 1, 2, 3, 5) +// +// // ----- POSTCONDITIONS AFTER LAYER 8 +// // transform size 4 +// // transform indexing [6, 4] +// // transforms per batch 32 +// // batch indexing [0, 1, 2, 3, 5] +// // total batch size 128 +// +// // modulus x^2-1 pos 0:2 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095) +// assertranges ... +// +// // modulus x^2+1 pos 2:4 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095) +// assertranges ... +// +// // ----- LAYER 9 +// +// // butterfly(0,1,1,1,1,0) +// butterfly 0 1 1 1 1 0 (6, 4) (0, 1, 2, 3, 5) +// +// // butterfly(2,3,1,1,4,1) +// butterfly 2 3 1 1 4 1 (6, 4) (0, 1, 2, 3, 5) +// +// // ----- POSTCONDITIONS AFTER LAYER 9 +// // transform size 4 +// // transform indexing [6, 4] +// // transforms per batch 32 +// // batch indexing [0, 1, 2, 3, 5] +// // total batch size 128 +// +// // modulus x^1-1 pos 0:1 q 7681,10753 bound 1*(22516,26416) +// assertranges ... +// +// // modulus x^1+1 pos 1:2 q 7681,10753 bound 1*(22516,26416) +// assertranges ... +// +// // modulus x^1-zeta4 pos 2:3 q 7681,10753 bound 1*(15747,17745) +// assertranges ... +// +// // modulus x^1+zeta4 pos 3:4 q 7681,10753 bound 1*(15747,17745) +// assertranges ... 
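(Expository aside, not part of the patch.) The pass-1 trace above builds the 512-point transform from a few primitives: a butterfly combines two coefficients, optionally multiplying one of them by a root of unity first; a reduce pulls coefficients back into a small range; and a twist multiplies a block by successive powers of a root so that a modulus of the form x^n-c can be treated as x^n-1 at the next layer. Below is a minimal scalar sketch of the butterfly step only; q, zeta and the plain % reduction are illustrative placeholders, not the scaled 16-lane arithmetic that mulmod_scaled_x16, add_x16 and sub_x16 implement in the generated code.

    /* Scalar model of the "butterfly n k" steps recorded in the trace:
     * multiply one input by a root of unity, then form sum and difference.
     * Placeholder sketch only; the real code works on 16 int16 lanes at a
     * time and keeps values only loosely reduced between layers. */
    #include <stdint.h>
    #include <stdio.h>

    static int32_t mulmod(int32_t a, int32_t b, int32_t q) {
        return (int32_t) (((int64_t) a * b) % q);
    }

    /* f0' = f0 + zeta*f1, f1' = f0 - zeta*f1  (mod q) */
    static void butterfly(int32_t *f0, int32_t *f1, int32_t zeta, int32_t q) {
        int32_t t = mulmod(*f1, zeta, q);
        int32_t x = *f0;
        *f0 = (x + t) % q;
        *f1 = (x - t) % q;
    }

    int main(void) {
        int32_t a = 1000, b = 2000;
        /* zeta = 3 is an arbitrary placeholder, not a real root of unity mod 7681 */
        butterfly(&a, &b, 3, 7681);
        printf("%d %d\n", (int) a, (int) b);
        return 0;
    }

In the AVX2 code that follows, add_x16/sub_x16 perform the sum and difference across 16 lanes, and mulmod_scaled_x16 supplies the multiplication by the root using precomputed constants.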
+// stopbatch 128 +// physical_unmap (6, 4) (0, 1, 2, 3, 5) +// stopntt 512 + +// ----- codegen pass 2 +// +// startntt 512 +// startbatch 512 +// vector_butterfly 0 256 1 0 +// vector_butterfly 128 384 1 0 +// vector_butterfly 64 320 1 0 +// vector_butterfly 192 448 1 0 +// vector_reduce_ifreverse 0 +// vector_reduce_ifreverse 256 +// vector_butterfly 0 128 1 0 +// vector_butterfly 64 192 1 0 +// vector_butterfly 256 384 4 1 +// vector_butterfly 320 448 4 1 +// vector_reduce_ifforward 64 +// vector_butterfly 0 64 1 0 +// vector_butterfly 128 192 4 1 +// vector_butterfly 256 320 8 1 +// vector_butterfly 384 448 8 7 +// vector_reduce 0 +// vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// stopbatch 512 +// startbatch 512 +// vector_butterfly 16 272 1 0 +// vector_butterfly 144 400 1 0 +// vector_butterfly 80 336 1 0 +// vector_butterfly 208 464 1 0 +// vector_reduce_ifreverse 16 +// vector_reduce_ifreverse 272 +// vector_butterfly 16 144 1 0 +// vector_butterfly 80 208 1 0 +// vector_butterfly 272 400 4 1 +// vector_butterfly 336 464 4 1 +// vector_reduce_ifforward 80 +// vector_butterfly 16 80 1 0 +// vector_butterfly 144 208 4 1 +// vector_butterfly 272 336 8 1 +// vector_butterfly 400 464 8 7 +// vector_reduce 16 +// vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// stopbatch 512 +// startbatch 512 +// vector_butterfly 32 288 1 0 +// vector_butterfly 160 416 1 0 +// vector_butterfly 96 352 1 0 +// vector_butterfly 224 480 1 0 +// vector_reduce_ifreverse 32 +// vector_reduce_ifreverse 288 +// vector_butterfly 32 160 1 0 +// vector_butterfly 96 224 1 0 +// vector_butterfly 288 416 4 1 +// vector_butterfly 352 480 4 1 +// vector_reduce_ifforward 96 +// vector_butterfly 32 96 1 0 +// vector_butterfly 160 224 4 1 +// vector_butterfly 288 352 8 1 +// vector_butterfly 416 480 8 7 +// vector_reduce 32 +// vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// 
vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 +// vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// stopbatch 512 +// startbatch 512 +// vector_butterfly 48 304 1 0 +// vector_butterfly 176 432 1 0 +// vector_butterfly 112 368 1 0 +// vector_butterfly 240 496 1 0 +// vector_reduce_ifreverse 48 +// vector_reduce_ifreverse 304 +// vector_butterfly 48 176 1 0 +// vector_butterfly 112 240 1 0 +// vector_butterfly 304 432 4 1 +// vector_butterfly 368 496 4 1 +// vector_reduce_ifforward 112 +// vector_butterfly 48 112 1 0 +// vector_butterfly 176 240 4 1 +// vector_butterfly 304 368 8 1 +// vector_butterfly 432 496 8 7 +// vector_reduce 48 +// vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +// vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +// vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +// vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +// vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +// vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +// vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +// vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi +// stopbatch 512 +// doublereps +// doublereps +// startbatch 128 +// vector_butterfly 0 32 1 0 +// vector_butterfly 64 96 1 0 +// vector_butterfly 16 48 1 0 +// vector_butterfly 80 112 1 0 +// vector_butterfly 0 16 1 0 +// vector_butterfly 64 80 1 0 +// vector_butterfly 32 48 4 1 +// vector_butterfly 96 112 4 1 +// vector_reduce 0 +// vector_reduce 64 +// vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 +// vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 +// vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 +// vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 +// vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 +// vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 +// vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 +// vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 +// vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 +// vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 +// vector_butterfly 0 64 1 0 +// vector_butterfly 32 96 1 0 +// vector_butterfly 16 80 1 0 +// vector_butterfly 48 112 1 0 +// vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 +// vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 +// vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 +// vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 +// 
stopbatch 128 +// startbatch 128 +// vector_butterfly 0 32 1 0 +// vector_butterfly 16 48 1 0 +// vector_butterfly 64 96 4 1 +// vector_butterfly 80 112 4 1 +// vector_reduce 0 +// vector_reduce 16 +// vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 +// vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 +// vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 +// vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 +// vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 +// vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 +// vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 +// vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 +// vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 +// vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 +// vector_butterfly 0 16 1 0 +// vector_butterfly 64 80 1 0 +// vector_butterfly 32 48 1 0 +// vector_butterfly 96 112 1 0 +// vector_butterfly 0 64 1 0 +// vector_butterfly 32 96 1 0 +// vector_butterfly 16 80 4 1 +// vector_butterfly 48 112 4 1 +// stopbatch 128 +// stopntt 512 +// startntt 512 + +static void ntt512(int16 *f, int reps, const int16 *qdata) { + // startbatch 512 + for (long long r = 0; r < reps; ++r) { + // vector_butterfly 0 256 1 0 + int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); + int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256)); + int16x16 b0 = add_x16(a0, a16); + int16x16 b16 = sub_x16(a0, a16); + // vector_butterfly 128 384 1 0 + int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128)); + int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384)); + int16x16 b8 = add_x16(a8, a24); + int16x16 b24 = sub_x16(a8, a24); + // vector_butterfly 64 320 1 0 + int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); + int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320)); + int16x16 b4 = add_x16(a4, a20); + int16x16 b20 = sub_x16(a4, a20); + // vector_butterfly 192 448 1 0 + int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192)); + int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448)); + int16x16 b12 = add_x16(a12, a28); + int16x16 b28 = sub_x16(a12, a28); + // vector_reduce_ifreverse 0 + // vector_reduce_ifreverse 256 + // vector_butterfly 0 128 1 0 + int16x16 c0 = add_x16(b0, b8); + int16x16 c8 = sub_x16(b0, b8); + // vector_butterfly 64 192 1 0 + int16x16 c4 = add_x16(b4, b12); + int16x16 c12 = sub_x16(b4, b12); + // vector_butterfly 256 384 4 1 + b24 = mulmod_scaled_x16(b24, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c16 = add_x16(b16, b24); + int16x16 c24 = sub_x16(b16, b24); + // vector_butterfly 320 448 4 1 + b28 = mulmod_scaled_x16(b28, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c20 = add_x16(b20, b28); + int16x16 c28 = sub_x16(b20, b28); + // vector_reduce_ifforward 64 + c4 = reduce_x16(c4, qdata); + // vector_butterfly 0 64 1 0 + int16x16 d0 = add_x16(c0, c4); + int16x16 d4 = sub_x16(c0, c4); + // vector_butterfly 128 192 4 1 + c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 d8 = add_x16(c8, c12); + int16x16 d12 = sub_x16(c8, c12); + // vector_butterfly 256 320 8 1 + c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); + int16x16 d16 = add_x16(c16, c20); + int16x16 d20 = sub_x16(c16, c20); + // vector_butterfly 384 448 8 7 + c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); + int16x16 d24 = add_x16(c24, c28); + int16x16 d28 = sub_x16(c24, c28); + // vector_reduce 0 + d0 = reduce_x16(d0, qdata); + 
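+        // Expository note (not part of the upstream file): each vector_twist below
+        // multiplies the 16 lanes by a vector of precomputed twiddle factors; the
+        // paired precomp_*/qinvprecomp_* tables suggest the usual trick of storing
+        // both the constant and the constant times q^-1 mod 2^16 so that
+        // mulmod_scaled_x16 can reduce products with cheap 16-bit multiplies
+        // instead of divisions.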
// vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + d4 = mulmod_scaled_x16(d4, precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + d8 = mulmod_scaled_x16(d8, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + d12 = mulmod_scaled_x16(d12, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + d16 = mulmod_scaled_x16(d16, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + d20 = mulmod_scaled_x16(d20, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + d24 = mulmod_scaled_x16(d24, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + d28 = mulmod_scaled_x16(d28, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e0 = _mm256_permute2x128_si256_lo(d0, d4); + int16x16 e4 = _mm256_permute2x128_si256_hi(d0, d4); + // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e8 = _mm256_permute2x128_si256_lo(d8, d12); + int16x16 e12 = _mm256_permute2x128_si256_hi(d8, d12); + // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e16 = _mm256_permute2x128_si256_lo(d16, d20); + int16x16 e20 = _mm256_permute2x128_si256_hi(d16, d20); + // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e24 = _mm256_permute2x128_si256_lo(d24, d28); + int16x16 e28 = _mm256_permute2x128_si256_hi(d24, d28); + // stopbatch 512 + _mm256_storeu_si256((int16x16 *) (f + 0), e0); + _mm256_storeu_si256((int16x16 *) (f + 64), e4); + _mm256_storeu_si256((int16x16 *) (f + 128), e8); + _mm256_storeu_si256((int16x16 *) (f + 192), e12); + _mm256_storeu_si256((int16x16 *) (f + 256), e16); + _mm256_storeu_si256((int16x16 *) (f + 320), e20); + _mm256_storeu_si256((int16x16 *) (f + 384), e24); + _mm256_storeu_si256((int16x16 *) (f + 448), e28); + f += 512; + } + f -= 512 * reps; + // startbatch 512 + for (long long r = 0; r < reps; ++r) { + // vector_butterfly 16 272 1 0 + int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); + int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272)); + int16x16 b1 = add_x16(a1, a17); + int16x16 b17 = sub_x16(a1, a17); + // vector_butterfly 144 400 1 0 + int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144)); + int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400)); + int16x16 b9 = add_x16(a9, a25); + int16x16 b25 = sub_x16(a9, a25); + // vector_butterfly 80 336 1 0 + int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); + int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336)); + int16x16 b5 = add_x16(a5, a21); + int16x16 b21 = sub_x16(a5, a21); + // vector_butterfly 208 464 1 0 + int16x16 a13 = 
_mm256_loadu_si256((int16x16 *) (f + 208)); + int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464)); + int16x16 b13 = add_x16(a13, a29); + int16x16 b29 = sub_x16(a13, a29); + // vector_reduce_ifreverse 16 + // vector_reduce_ifreverse 272 + // vector_butterfly 16 144 1 0 + int16x16 c1 = add_x16(b1, b9); + int16x16 c9 = sub_x16(b1, b9); + // vector_butterfly 80 208 1 0 + int16x16 c5 = add_x16(b5, b13); + int16x16 c13 = sub_x16(b5, b13); + // vector_butterfly 272 400 4 1 + b25 = mulmod_scaled_x16(b25, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c17 = add_x16(b17, b25); + int16x16 c25 = sub_x16(b17, b25); + // vector_butterfly 336 464 4 1 + b29 = mulmod_scaled_x16(b29, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c21 = add_x16(b21, b29); + int16x16 c29 = sub_x16(b21, b29); + // vector_reduce_ifforward 80 + c5 = reduce_x16(c5, qdata); + // vector_butterfly 16 80 1 0 + int16x16 d1 = add_x16(c1, c5); + int16x16 d5 = sub_x16(c1, c5); + // vector_butterfly 144 208 4 1 + c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 d9 = add_x16(c9, c13); + int16x16 d13 = sub_x16(c9, c13); + // vector_butterfly 272 336 8 1 + c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); + int16x16 d17 = add_x16(c17, c21); + int16x16 d21 = sub_x16(c17, c21); + // vector_butterfly 400 464 8 7 + c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); + int16x16 d25 = add_x16(c25, c29); + int16x16 d29 = sub_x16(c25, c29); + // vector_reduce 16 + d1 = reduce_x16(d1, qdata); + // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + d5 = mulmod_scaled_x16(d5, precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + d9 = mulmod_scaled_x16(d9, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + d13 = mulmod_scaled_x16(d13, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + d17 = mulmod_scaled_x16(d17, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + d21 = mulmod_scaled_x16(d21, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + d25 = mulmod_scaled_x16(d25, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + d29 = mulmod_scaled_x16(d29, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e1 = _mm256_permute2x128_si256_lo(d1, d5); + int16x16 e5 = _mm256_permute2x128_si256_hi(d1, d5); + // vector_permute 144 208 
_mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e9 = _mm256_permute2x128_si256_lo(d9, d13); + int16x16 e13 = _mm256_permute2x128_si256_hi(d9, d13); + // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e17 = _mm256_permute2x128_si256_lo(d17, d21); + int16x16 e21 = _mm256_permute2x128_si256_hi(d17, d21); + // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e25 = _mm256_permute2x128_si256_lo(d25, d29); + int16x16 e29 = _mm256_permute2x128_si256_hi(d25, d29); + // stopbatch 512 + _mm256_storeu_si256((int16x16 *) (f + 16), e1); + _mm256_storeu_si256((int16x16 *) (f + 80), e5); + _mm256_storeu_si256((int16x16 *) (f + 144), e9); + _mm256_storeu_si256((int16x16 *) (f + 208), e13); + _mm256_storeu_si256((int16x16 *) (f + 272), e17); + _mm256_storeu_si256((int16x16 *) (f + 336), e21); + _mm256_storeu_si256((int16x16 *) (f + 400), e25); + _mm256_storeu_si256((int16x16 *) (f + 464), e29); + f += 512; + } + f -= 512 * reps; + // startbatch 512 + for (long long r = 0; r < reps; ++r) { + // vector_butterfly 32 288 1 0 + int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); + int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288)); + int16x16 b2 = add_x16(a2, a18); + int16x16 b18 = sub_x16(a2, a18); + // vector_butterfly 160 416 1 0 + int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160)); + int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416)); + int16x16 b10 = add_x16(a10, a26); + int16x16 b26 = sub_x16(a10, a26); + // vector_butterfly 96 352 1 0 + int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); + int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352)); + int16x16 b6 = add_x16(a6, a22); + int16x16 b22 = sub_x16(a6, a22); + // vector_butterfly 224 480 1 0 + int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224)); + int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480)); + int16x16 b14 = add_x16(a14, a30); + int16x16 b30 = sub_x16(a14, a30); + // vector_reduce_ifreverse 32 + // vector_reduce_ifreverse 288 + // vector_butterfly 32 160 1 0 + int16x16 c2 = add_x16(b2, b10); + int16x16 c10 = sub_x16(b2, b10); + // vector_butterfly 96 224 1 0 + int16x16 c6 = add_x16(b6, b14); + int16x16 c14 = sub_x16(b6, b14); + // vector_butterfly 288 416 4 1 + b26 = mulmod_scaled_x16(b26, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c18 = add_x16(b18, b26); + int16x16 c26 = sub_x16(b18, b26); + // vector_butterfly 352 480 4 1 + b30 = mulmod_scaled_x16(b30, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c22 = add_x16(b22, b30); + int16x16 c30 = sub_x16(b22, b30); + // vector_reduce_ifforward 96 + c6 = reduce_x16(c6, qdata); + // vector_butterfly 32 96 1 0 + int16x16 d2 = add_x16(c2, c6); + int16x16 d6 = sub_x16(c2, c6); + // vector_butterfly 160 224 4 1 + c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 d10 = add_x16(c10, c14); + int16x16 d14 = sub_x16(c10, c14); + // vector_butterfly 288 352 8 1 + c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); + int16x16 d18 = add_x16(c18, c22); + int16x16 d22 = sub_x16(c18, c22); + // vector_butterfly 416 480 8 7 + c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); + int16x16 d26 = add_x16(c26, c30); + int16x16 d30 = sub_x16(c26, c30); + // vector_reduce 32 + d2 = reduce_x16(d2, qdata); + // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + d6 = mulmod_scaled_x16(d6, 
precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + d10 = mulmod_scaled_x16(d10, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + d14 = mulmod_scaled_x16(d14, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + d18 = mulmod_scaled_x16(d18, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + d22 = mulmod_scaled_x16(d22, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + d26 = mulmod_scaled_x16(d26, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + d30 = mulmod_scaled_x16(d30, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e2 = _mm256_permute2x128_si256_lo(d2, d6); + int16x16 e6 = _mm256_permute2x128_si256_hi(d2, d6); + // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e10 = _mm256_permute2x128_si256_lo(d10, d14); + int16x16 e14 = _mm256_permute2x128_si256_hi(d10, d14); + // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e18 = _mm256_permute2x128_si256_lo(d18, d22); + int16x16 e22 = _mm256_permute2x128_si256_hi(d18, d22); + // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e26 = _mm256_permute2x128_si256_lo(d26, d30); + int16x16 e30 = _mm256_permute2x128_si256_hi(d26, d30); + // stopbatch 512 + _mm256_storeu_si256((int16x16 *) (f + 32), e2); + _mm256_storeu_si256((int16x16 *) (f + 96), e6); + _mm256_storeu_si256((int16x16 *) (f + 160), e10); + _mm256_storeu_si256((int16x16 *) (f + 224), e14); + _mm256_storeu_si256((int16x16 *) (f + 288), e18); + _mm256_storeu_si256((int16x16 *) (f + 352), e22); + _mm256_storeu_si256((int16x16 *) (f + 416), e26); + _mm256_storeu_si256((int16x16 *) (f + 480), e30); + f += 512; + } + f -= 512 * reps; + // startbatch 512 + for (long long r = 0; r < reps; ++r) { + // vector_butterfly 48 304 1 0 + int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); + int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304)); + int16x16 b3 = add_x16(a3, a19); + int16x16 b19 = sub_x16(a3, a19); + // vector_butterfly 176 432 1 0 + int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176)); + int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432)); + int16x16 b11 = add_x16(a11, a27); + int16x16 b27 = sub_x16(a11, a27); + // vector_butterfly 112 368 1 0 + int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); + int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368)); + 
int16x16 b7 = add_x16(a7, a23); + int16x16 b23 = sub_x16(a7, a23); + // vector_butterfly 240 496 1 0 + int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240)); + int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496)); + int16x16 b15 = add_x16(a15, a31); + int16x16 b31 = sub_x16(a15, a31); + // vector_reduce_ifreverse 48 + // vector_reduce_ifreverse 304 + // vector_butterfly 48 176 1 0 + int16x16 c3 = add_x16(b3, b11); + int16x16 c11 = sub_x16(b3, b11); + // vector_butterfly 112 240 1 0 + int16x16 c7 = add_x16(b7, b15); + int16x16 c15 = sub_x16(b7, b15); + // vector_butterfly 304 432 4 1 + b27 = mulmod_scaled_x16(b27, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c19 = add_x16(b19, b27); + int16x16 c27 = sub_x16(b19, b27); + // vector_butterfly 368 496 4 1 + b31 = mulmod_scaled_x16(b31, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c23 = add_x16(b23, b31); + int16x16 c31 = sub_x16(b23, b31); + // vector_reduce_ifforward 112 + c7 = reduce_x16(c7, qdata); + // vector_butterfly 48 112 1 0 + int16x16 d3 = add_x16(c3, c7); + int16x16 d7 = sub_x16(c3, c7); + // vector_butterfly 176 240 4 1 + c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 d11 = add_x16(c11, c15); + int16x16 d15 = sub_x16(c11, c15); + // vector_butterfly 304 368 8 1 + c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); + int16x16 d19 = add_x16(c19, c23); + int16x16 d23 = sub_x16(c19, c23); + // vector_butterfly 432 496 8 7 + c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); + int16x16 d27 = add_x16(c27, c31); + int16x16 d31 = sub_x16(c27, c31); + // vector_reduce 48 + d3 = reduce_x16(d3, qdata); + // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + d7 = mulmod_scaled_x16(d7, precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + d11 = mulmod_scaled_x16(d11, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + d15 = mulmod_scaled_x16(d15, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + d19 = mulmod_scaled_x16(d19, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + d23 = mulmod_scaled_x16(d23, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + d27 = mulmod_scaled_x16(d27, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + d31 = mulmod_scaled_x16(d31, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e3 
= _mm256_permute2x128_si256_lo(d3, d7); + int16x16 e7 = _mm256_permute2x128_si256_hi(d3, d7); + // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e11 = _mm256_permute2x128_si256_lo(d11, d15); + int16x16 e15 = _mm256_permute2x128_si256_hi(d11, d15); + // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e19 = _mm256_permute2x128_si256_lo(d19, d23); + int16x16 e23 = _mm256_permute2x128_si256_hi(d19, d23); + // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 e27 = _mm256_permute2x128_si256_lo(d27, d31); + int16x16 e31 = _mm256_permute2x128_si256_hi(d27, d31); + // stopbatch 512 + _mm256_storeu_si256((int16x16 *) (f + 48), e3); + _mm256_storeu_si256((int16x16 *) (f + 112), e7); + _mm256_storeu_si256((int16x16 *) (f + 176), e11); + _mm256_storeu_si256((int16x16 *) (f + 240), e15); + _mm256_storeu_si256((int16x16 *) (f + 304), e19); + _mm256_storeu_si256((int16x16 *) (f + 368), e23); + _mm256_storeu_si256((int16x16 *) (f + 432), e27); + _mm256_storeu_si256((int16x16 *) (f + 496), e31); + f += 512; + } + f -= 512 * reps; + // doublereps + reps *= 2; + // doublereps + reps *= 2; + // startbatch 128 + for (long long r = 0; r < reps; ++r) { + // vector_butterfly 0 32 1 0 + int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); + int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); + int16x16 b0 = add_x16(a0, a2); + int16x16 b2 = sub_x16(a0, a2); + // vector_butterfly 64 96 1 0 + int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); + int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); + int16x16 b4 = add_x16(a4, a6); + int16x16 b6 = sub_x16(a4, a6); + // vector_butterfly 16 48 1 0 + int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); + int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); + int16x16 b1 = add_x16(a1, a3); + int16x16 b3 = sub_x16(a1, a3); + // vector_butterfly 80 112 1 0 + int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); + int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); + int16x16 b5 = add_x16(a5, a7); + int16x16 b7 = sub_x16(a5, a7); + // vector_butterfly 0 16 1 0 + int16x16 c0 = add_x16(b0, b1); + int16x16 c1 = sub_x16(b0, b1); + // vector_butterfly 64 80 1 0 + int16x16 c4 = add_x16(b4, b5); + int16x16 c5 = sub_x16(b4, b5); + // vector_butterfly 32 48 4 1 + b3 = mulmod_scaled_x16(b3, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c2 = add_x16(b2, b3); + int16x16 c3 = sub_x16(b2, b3); + // vector_butterfly 96 112 4 1 + b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 c6 = add_x16(b6, b7); + int16x16 c7 = sub_x16(b6, b7); + // vector_reduce 0 + c0 = reduce_x16(c0, qdata); + // vector_reduce 64 + c4 = reduce_x16(c4, qdata); + // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + c1 = mulmod_scaled_x16(c1, precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); + // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 + c5 = mulmod_scaled_x16(c5, precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); + // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + c2 = mulmod_scaled_x16(c2, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); + // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 + c6 = mulmod_scaled_x16(c6, 
precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); + // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + c3 = mulmod_scaled_x16(c3, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); + // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 + c7 = mulmod_scaled_x16(c7, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); + // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 + int16x16 d0 = _mm256_unpacklo_epi16(c0, c2); + int16x16 d2 = _mm256_unpackhi_epi16(c0, c2); + // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 + int16x16 d1 = _mm256_unpacklo_epi16(c1, c3); + int16x16 d3 = _mm256_unpackhi_epi16(c1, c3); + // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 + int16x16 d4 = _mm256_unpacklo_epi16(c4, c6); + int16x16 d6 = _mm256_unpackhi_epi16(c4, c6); + // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 + int16x16 d5 = _mm256_unpacklo_epi16(c5, c7); + int16x16 d7 = _mm256_unpackhi_epi16(c5, c7); + // vector_butterfly 0 64 1 0 + int16x16 e0 = add_x16(d0, d4); + int16x16 e4 = sub_x16(d0, d4); + // vector_butterfly 32 96 1 0 + int16x16 e2 = add_x16(d2, d6); + int16x16 e6 = sub_x16(d2, d6); + // vector_butterfly 16 80 1 0 + int16x16 e1 = add_x16(d1, d5); + int16x16 e5 = sub_x16(d1, d5); + // vector_butterfly 48 112 1 0 + int16x16 e3 = add_x16(d3, d7); + int16x16 e7 = sub_x16(d3, d7); + // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 + int16x16 f0 = _mm256_unpacklo_epi32(e0, e1); + int16x16 f1 = _mm256_unpackhi_epi32(e0, e1); + // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 + int16x16 f2 = _mm256_unpacklo_epi32(e2, e3); + int16x16 f3 = _mm256_unpackhi_epi32(e2, e3); + // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 + int16x16 f4 = _mm256_unpacklo_epi32(e4, e5); + int16x16 f5 = _mm256_unpackhi_epi32(e4, e5); + // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 + int16x16 f6 = _mm256_unpacklo_epi32(e6, e7); + int16x16 f7 = _mm256_unpackhi_epi32(e6, e7); + // stopbatch 128 + _mm256_storeu_si256((int16x16 *) (f + 0), f0); + _mm256_storeu_si256((int16x16 *) (f + 16), f1); + _mm256_storeu_si256((int16x16 *) (f + 32), f2); + _mm256_storeu_si256((int16x16 *) (f + 48), f3); + _mm256_storeu_si256((int16x16 *) (f + 64), f4); + _mm256_storeu_si256((int16x16 *) (f + 80), f5); + _mm256_storeu_si256((int16x16 *) (f + 96), f6); + _mm256_storeu_si256((int16x16 *) (f + 112), f7); + f += 128; + } + f -= 128 * reps; + // startbatch 128 + for (long long r = 0; r < reps; ++r) { + // vector_butterfly 0 32 1 0 + int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); + int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); + int16x16 b0 = add_x16(a0, a2); + int16x16 b2 = sub_x16(a0, a2); + // vector_butterfly 16 48 1 0 + int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); + int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); + int16x16 b1 = add_x16(a1, a3); + int16x16 b3 = sub_x16(a1, a3); + // vector_butterfly 64 96 4 1 + int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); + int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); + a6 = mulmod_scaled_x16(a6, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 b4 = add_x16(a4, a6); + int16x16 b6 = sub_x16(a4, a6); + // vector_butterfly 80 112 4 1 + int16x16 a5 = 
_mm256_loadu_si256((int16x16 *) (f + 80)); + int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); + a7 = mulmod_scaled_x16(a7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 b5 = add_x16(a5, a7); + int16x16 b7 = sub_x16(a5, a7); + // vector_reduce 0 + b0 = reduce_x16(b0, qdata); + // vector_reduce 16 + b1 = reduce_x16(b1, qdata); + // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 + b2 = mulmod_scaled_x16(b2, precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); + // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 + b3 = mulmod_scaled_x16(b3, precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); + // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 + b4 = mulmod_scaled_x16(b4, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); + // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 + b5 = mulmod_scaled_x16(b5, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); + // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 + b6 = mulmod_scaled_x16(b6, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); + // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 + b7 = mulmod_scaled_x16(b7, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); + // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 + int16x16 c0 = _mm256_unpacklo_epi64(b0, b4); + int16x16 c4 = _mm256_unpackhi_epi64(b0, b4); + // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 + int16x16 c1 = _mm256_unpacklo_epi64(b1, b5); + int16x16 c5 = _mm256_unpackhi_epi64(b1, b5); + // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 + int16x16 c2 = _mm256_unpacklo_epi64(b2, b6); + int16x16 c6 = _mm256_unpackhi_epi64(b2, b6); + // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 + int16x16 c3 = _mm256_unpacklo_epi64(b3, b7); + int16x16 c7 = _mm256_unpackhi_epi64(b3, b7); + // vector_butterfly 0 16 1 0 + int16x16 d0 = add_x16(c0, c1); + int16x16 d1 = sub_x16(c0, c1); + // vector_butterfly 64 80 1 0 + int16x16 d4 = add_x16(c4, c5); + int16x16 d5 = sub_x16(c4, c5); + // vector_butterfly 32 48 1 0 + int16x16 d2 = add_x16(c2, c3); + int16x16 d3 = sub_x16(c2, c3); + // vector_butterfly 96 112 1 0 + int16x16 d6 = add_x16(c6, c7); + int16x16 d7 = sub_x16(c6, c7); + // vector_butterfly 0 64 1 0 + int16x16 e0 = add_x16(d0, d4); + int16x16 e4 = sub_x16(d0, d4); + // vector_butterfly 32 96 1 0 + int16x16 e2 = add_x16(d2, d6); + int16x16 e6 = sub_x16(d2, d6); + // vector_butterfly 16 80 4 1 + d5 = mulmod_scaled_x16(d5, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 e1 = add_x16(d1, d5); + int16x16 e5 = sub_x16(d1, d5); + // vector_butterfly 48 112 4 1 + d7 = mulmod_scaled_x16(d7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); + int16x16 e3 = add_x16(d3, d7); + int16x16 e7 = sub_x16(d3, d7); + // stopbatch 128 + _mm256_storeu_si256((int16x16 *) (f + 0), e0); + _mm256_storeu_si256((int16x16 *) (f + 16), e1); + _mm256_storeu_si256((int16x16 *) (f + 32), e2); + _mm256_storeu_si256((int16x16 *) (f + 48), e3); + _mm256_storeu_si256((int16x16 *) (f + 64), e4); + _mm256_storeu_si256((int16x16 *) (f + 80), e5); + _mm256_storeu_si256((int16x16 *) (f + 96), e6); + _mm256_storeu_si256((int16x16 *) (f + 112), e7); + f += 128; + } + // f -= 
128*reps; + // stopntt 512 +} + +void PQCLEAN_SNTRUP761_AVX2_ntt512_7681(int16 *f, int reps) { + ntt512(f, reps, qdata_7681.data); +} + +void PQCLEAN_SNTRUP761_AVX2_ntt512_10753(int16 *f, int reps) { + ntt512(f, reps, qdata_10753.data); +} +// inv stopntt 512 + +static void invntt512(int16 *f, int reps, const int16 *qdata) { + reps *= 4; + // inv stopbatch 128 + for (long long r = 0; r < reps; ++r) { + // inv vector_butterfly 48 112 4 1 + int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); + int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); + int16x16 b3 = add_x16(a3, a7); + int16x16 b7 = sub_x16(a3, a7); + b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 16 80 4 1 + int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); + int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); + int16x16 b1 = add_x16(a1, a5); + int16x16 b5 = sub_x16(a1, a5); + b5 = mulmod_scaled_x16(b5, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 32 96 1 0 + int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); + int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); + int16x16 b2 = add_x16(a2, a6); + int16x16 b6 = sub_x16(a2, a6); + // inv vector_butterfly 0 64 1 0 + int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); + int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); + int16x16 b0 = add_x16(a0, a4); + int16x16 b4 = sub_x16(a0, a4); + // inv vector_butterfly 96 112 1 0 + int16x16 c6 = add_x16(b6, b7); + int16x16 c7 = sub_x16(b6, b7); + // inv vector_butterfly 32 48 1 0 + int16x16 c2 = add_x16(b2, b3); + int16x16 c3 = sub_x16(b2, b3); + // inv vector_butterfly 64 80 1 0 + int16x16 c4 = add_x16(b4, b5); + int16x16 c5 = sub_x16(b4, b5); + // inv vector_butterfly 0 16 1 0 + int16x16 c0 = add_x16(b0, b1); + int16x16 c1 = sub_x16(b0, b1); + // inv vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 + int16x16 d3 = _mm256_unpacklo_epi64(c3, c7); + int16x16 d7 = _mm256_unpackhi_epi64(c3, c7); + // inv vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 + int16x16 d2 = _mm256_unpacklo_epi64(c2, c6); + int16x16 d6 = _mm256_unpackhi_epi64(c2, c6); + // inv vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 + int16x16 d1 = _mm256_unpacklo_epi64(c1, c5); + int16x16 d5 = _mm256_unpackhi_epi64(c1, c5); + // inv vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 + int16x16 d0 = _mm256_unpacklo_epi64(c0, c4); + int16x16 d4 = _mm256_unpackhi_epi64(c0, c4); + // inv vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 + d7 = mulmod_scaled_x16(d7, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); + // inv vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 + d6 = mulmod_scaled_x16(d6, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); + // inv vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 + d5 = mulmod_scaled_x16(d5, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); + // inv vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 + d4 = mulmod_scaled_x16(d4, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); + // inv vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 + d3 = mulmod_scaled_x16(d3, precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); + // inv vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 
0 0 1 1 1 1 + d2 = mulmod_scaled_x16(d2, precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); + // inv vector_reduce 16 + d1 = reduce_x16(d1, qdata); + // inv vector_reduce 0 + d0 = reduce_x16(d0, qdata); + // inv vector_butterfly 80 112 4 1 + int16x16 e5 = add_x16(d5, d7); + int16x16 e7 = sub_x16(d5, d7); + e7 = mulmod_scaled_x16(e7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 64 96 4 1 + int16x16 e4 = add_x16(d4, d6); + int16x16 e6 = sub_x16(d4, d6); + e6 = mulmod_scaled_x16(e6, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 16 48 1 0 + int16x16 e1 = add_x16(d1, d3); + int16x16 e3 = sub_x16(d1, d3); + // inv vector_butterfly 0 32 1 0 + int16x16 e0 = add_x16(d0, d2); + int16x16 e2 = sub_x16(d0, d2); + // inv startbatch 128 + _mm256_storeu_si256((int16x16 *) (f + 0), e0); + _mm256_storeu_si256((int16x16 *) (f + 16), e1); + _mm256_storeu_si256((int16x16 *) (f + 32), e2); + _mm256_storeu_si256((int16x16 *) (f + 48), e3); + _mm256_storeu_si256((int16x16 *) (f + 64), e4); + _mm256_storeu_si256((int16x16 *) (f + 80), e5); + _mm256_storeu_si256((int16x16 *) (f + 96), e6); + _mm256_storeu_si256((int16x16 *) (f + 112), e7); + f += 128; + } + f -= 128 * reps; + // inv stopbatch 128 + for (long long r = 0; r < reps; ++r) { + // inv vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 + int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); + int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); + int16x16 b6 = _mm256_unpacklo_epi32(a6, a7); + int16x16 b7 = _mm256_unpackhi_epi32(a6, a7); + int16x16 c6 = _mm256_unpacklo_epi32(b6, b7); + int16x16 c7 = _mm256_unpackhi_epi32(b6, b7); + // inv vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 + int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); + int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); + int16x16 b4 = _mm256_unpacklo_epi32(a4, a5); + int16x16 b5 = _mm256_unpackhi_epi32(a4, a5); + int16x16 c4 = _mm256_unpacklo_epi32(b4, b5); + int16x16 c5 = _mm256_unpackhi_epi32(b4, b5); + // inv vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 + int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); + int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); + int16x16 b2 = _mm256_unpacklo_epi32(a2, a3); + int16x16 b3 = _mm256_unpackhi_epi32(a2, a3); + int16x16 c2 = _mm256_unpacklo_epi32(b2, b3); + int16x16 c3 = _mm256_unpackhi_epi32(b2, b3); + // inv vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 + int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); + int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); + int16x16 b0 = _mm256_unpacklo_epi32(a0, a1); + int16x16 b1 = _mm256_unpackhi_epi32(a0, a1); + int16x16 c0 = _mm256_unpacklo_epi32(b0, b1); + int16x16 c1 = _mm256_unpackhi_epi32(b0, b1); + // inv vector_butterfly 48 112 1 0 + int16x16 d3 = add_x16(c3, c7); + int16x16 d7 = sub_x16(c3, c7); + // inv vector_butterfly 16 80 1 0 + int16x16 d1 = add_x16(c1, c5); + int16x16 d5 = sub_x16(c1, c5); + // inv vector_butterfly 32 96 1 0 + int16x16 d2 = add_x16(c2, c6); + int16x16 d6 = sub_x16(c2, c6); + // inv vector_butterfly 0 64 1 0 + int16x16 d0 = add_x16(c0, c4); + int16x16 d4 = sub_x16(c0, c4); + // inv vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 + int16x16 e5 = _mm256_unpacklo_epi16(d5, d7); + int16x16 e7 = _mm256_unpackhi_epi16(d5, d7); + int16x16 f5 = _mm256_unpacklo_epi16(e5, e7); + int16x16 f7 = _mm256_unpackhi_epi16(e5, e7); + int16x16 g5 = 
_mm256_unpacklo_epi16(f5, f7); + int16x16 g7 = _mm256_unpackhi_epi16(f5, f7); + // inv vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 + int16x16 e4 = _mm256_unpacklo_epi16(d4, d6); + int16x16 e6 = _mm256_unpackhi_epi16(d4, d6); + int16x16 f4 = _mm256_unpacklo_epi16(e4, e6); + int16x16 f6 = _mm256_unpackhi_epi16(e4, e6); + int16x16 g4 = _mm256_unpacklo_epi16(f4, f6); + int16x16 g6 = _mm256_unpackhi_epi16(f4, f6); + // inv vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 + int16x16 e1 = _mm256_unpacklo_epi16(d1, d3); + int16x16 e3 = _mm256_unpackhi_epi16(d1, d3); + int16x16 f1 = _mm256_unpacklo_epi16(e1, e3); + int16x16 f3 = _mm256_unpackhi_epi16(e1, e3); + int16x16 g1 = _mm256_unpacklo_epi16(f1, f3); + int16x16 g3 = _mm256_unpackhi_epi16(f1, f3); + // inv vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 + int16x16 e0 = _mm256_unpacklo_epi16(d0, d2); + int16x16 e2 = _mm256_unpackhi_epi16(d0, d2); + int16x16 f0 = _mm256_unpacklo_epi16(e0, e2); + int16x16 f2 = _mm256_unpackhi_epi16(e0, e2); + int16x16 g0 = _mm256_unpacklo_epi16(f0, f2); + int16x16 g2 = _mm256_unpackhi_epi16(f0, f2); + // inv vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 + g7 = mulmod_scaled_x16(g7, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + g3 = mulmod_scaled_x16(g3, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); + // inv vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 + g6 = mulmod_scaled_x16(g6, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + g2 = mulmod_scaled_x16(g2, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); + // inv vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 + g5 = mulmod_scaled_x16(g5, precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 + g1 = mulmod_scaled_x16(g1, precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); + // inv vector_reduce 64 + g4 = reduce_x16(g4, qdata); + // inv vector_reduce 0 + g0 = reduce_x16(g0, qdata); + // inv vector_butterfly 96 112 4 1 + int16x16 h6 = add_x16(g6, g7); + int16x16 h7 = sub_x16(g6, g7); + h7 = mulmod_scaled_x16(h7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 32 48 4 1 + int16x16 h2 = add_x16(g2, g3); + int16x16 h3 = sub_x16(g2, g3); + h3 = mulmod_scaled_x16(h3, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 64 80 1 0 + int16x16 h4 = add_x16(g4, g5); + int16x16 h5 = sub_x16(g4, g5); + // inv vector_butterfly 0 16 1 0 + int16x16 h0 = add_x16(g0, g1); + int16x16 h1 = sub_x16(g0, g1); + // inv vector_butterfly 80 112 1 0 + int16x16 i5 = add_x16(h5, h7); + int16x16 i7 = sub_x16(h5, h7); + // inv vector_butterfly 16 48 1 0 + int16x16 i1 = add_x16(h1, h3); + int16x16 i3 = sub_x16(h1, h3); + // inv vector_butterfly 64 96 1 0 + int16x16 i4 = add_x16(h4, h6); + int16x16 i6 = sub_x16(h4, h6); + // inv vector_butterfly 0 32 1 0 + int16x16 i0 = add_x16(h0, h2); + int16x16 i2 = sub_x16(h0, h2); + // inv startbatch 128 + 
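+        // Expository note (not part of the upstream file): invntt512 replays the
+        // forward trace in reverse order; a forward butterfly that multiplied by
+        // zeta_4^1 is undone here with zeta_4^3 (scaledzeta_x16_4_3), and a
+        // forward twist by k modulo n is undone with the n-k constant, e.g.
+        // precomp_16_1 earlier for the inverse of "vector_twist ... 16 15".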
_mm256_storeu_si256((int16x16 *) (f + 0), i0); + _mm256_storeu_si256((int16x16 *) (f + 16), i1); + _mm256_storeu_si256((int16x16 *) (f + 32), i2); + _mm256_storeu_si256((int16x16 *) (f + 48), i3); + _mm256_storeu_si256((int16x16 *) (f + 64), i4); + _mm256_storeu_si256((int16x16 *) (f + 80), i5); + _mm256_storeu_si256((int16x16 *) (f + 96), i6); + _mm256_storeu_si256((int16x16 *) (f + 112), i7); + f += 128; + } + f -= 128 * reps; + // inv doublereps + reps /= 2; + // inv doublereps + reps /= 2; + // inv stopbatch 512 + for (long long r = 0; r < reps; ++r) { + // inv vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432)); + int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496)); + int16x16 b27 = _mm256_permute2x128_si256_lo(a27, a31); + int16x16 b31 = _mm256_permute2x128_si256_hi(a27, a31); + // inv vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304)); + int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368)); + int16x16 b19 = _mm256_permute2x128_si256_lo(a19, a23); + int16x16 b23 = _mm256_permute2x128_si256_hi(a19, a23); + // inv vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176)); + int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240)); + int16x16 b11 = _mm256_permute2x128_si256_lo(a11, a15); + int16x16 b15 = _mm256_permute2x128_si256_hi(a11, a15); + // inv vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); + int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); + int16x16 b3 = _mm256_permute2x128_si256_lo(a3, a7); + int16x16 b7 = _mm256_permute2x128_si256_hi(a3, a7); + // inv vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b31 = mulmod_scaled_x16(b31, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // inv vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b27 = mulmod_scaled_x16(b27, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // inv vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b23 = mulmod_scaled_x16(b23, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // inv vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b19 = mulmod_scaled_x16(b19, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // inv vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b15 = mulmod_scaled_x16(b15, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // inv vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b11 = mulmod_scaled_x16(b11, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // inv vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 + b7 = mulmod_scaled_x16(b7, precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, 
qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); + // inv vector_reduce 48 + b3 = reduce_x16(b3, qdata); + // inv vector_butterfly 432 496 8 7 + int16x16 c27 = add_x16(b27, b31); + int16x16 c31 = sub_x16(b27, b31); + c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); + // inv vector_butterfly 304 368 8 1 + int16x16 c19 = add_x16(b19, b23); + int16x16 c23 = sub_x16(b19, b23); + c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); + // inv vector_butterfly 176 240 4 1 + int16x16 c11 = add_x16(b11, b15); + int16x16 c15 = sub_x16(b11, b15); + c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 48 112 1 0 + int16x16 c3 = add_x16(b3, b7); + int16x16 c7 = sub_x16(b3, b7); + // inv vector_reduce_ifforward 112 + // inv vector_butterfly 368 496 4 1 + int16x16 d23 = add_x16(c23, c31); + int16x16 d31 = sub_x16(c23, c31); + d31 = mulmod_scaled_x16(d31, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 304 432 4 1 + int16x16 d19 = add_x16(c19, c27); + int16x16 d27 = sub_x16(c19, c27); + d27 = mulmod_scaled_x16(d27, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 112 240 1 0 + int16x16 d7 = add_x16(c7, c15); + int16x16 d15 = sub_x16(c7, c15); + // inv vector_butterfly 48 176 1 0 + int16x16 d3 = add_x16(c3, c11); + int16x16 d11 = sub_x16(c3, c11); + // inv vector_reduce_ifreverse 304 + d19 = reduce_x16(d19, qdata); + // inv vector_reduce_ifreverse 48 + d3 = reduce_x16(d3, qdata); + // inv vector_butterfly 240 496 1 0 + int16x16 e15 = add_x16(d15, d31); + int16x16 e31 = sub_x16(d15, d31); + // inv vector_butterfly 112 368 1 0 + int16x16 e7 = add_x16(d7, d23); + int16x16 e23 = sub_x16(d7, d23); + // inv vector_butterfly 176 432 1 0 + int16x16 e11 = add_x16(d11, d27); + int16x16 e27 = sub_x16(d11, d27); + // inv vector_butterfly 48 304 1 0 + int16x16 e3 = add_x16(d3, d19); + int16x16 e19 = sub_x16(d3, d19); + // inv startbatch 512 + _mm256_storeu_si256((int16x16 *) (f + 48), e3); + _mm256_storeu_si256((int16x16 *) (f + 112), e7); + _mm256_storeu_si256((int16x16 *) (f + 176), e11); + _mm256_storeu_si256((int16x16 *) (f + 240), e15); + _mm256_storeu_si256((int16x16 *) (f + 304), e19); + _mm256_storeu_si256((int16x16 *) (f + 368), e23); + _mm256_storeu_si256((int16x16 *) (f + 432), e27); + _mm256_storeu_si256((int16x16 *) (f + 496), e31); + f += 512; + } + f -= 512 * reps; + // inv stopbatch 512 + for (long long r = 0; r < reps; ++r) { + // inv vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416)); + int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480)); + int16x16 b26 = _mm256_permute2x128_si256_lo(a26, a30); + int16x16 b30 = _mm256_permute2x128_si256_hi(a26, a30); + // inv vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288)); + int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352)); + int16x16 b18 = _mm256_permute2x128_si256_lo(a18, a22); + int16x16 b22 = _mm256_permute2x128_si256_hi(a18, a22); + // inv vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160)); + int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224)); + int16x16 b10 = _mm256_permute2x128_si256_lo(a10, a14); + int16x16 b14 = _mm256_permute2x128_si256_hi(a10, a14); + // inv 
vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); + int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); + int16x16 b2 = _mm256_permute2x128_si256_lo(a2, a6); + int16x16 b6 = _mm256_permute2x128_si256_hi(a2, a6); + // inv vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b30 = mulmod_scaled_x16(b30, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // inv vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b26 = mulmod_scaled_x16(b26, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // inv vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b22 = mulmod_scaled_x16(b22, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // inv vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b18 = mulmod_scaled_x16(b18, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // inv vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b14 = mulmod_scaled_x16(b14, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // inv vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b10 = mulmod_scaled_x16(b10, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // inv vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 + b6 = mulmod_scaled_x16(b6, precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); + // inv vector_reduce 32 + b2 = reduce_x16(b2, qdata); + // inv vector_butterfly 416 480 8 7 + int16x16 c26 = add_x16(b26, b30); + int16x16 c30 = sub_x16(b26, b30); + c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); + // inv vector_butterfly 288 352 8 1 + int16x16 c18 = add_x16(b18, b22); + int16x16 c22 = sub_x16(b18, b22); + c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); + // inv vector_butterfly 160 224 4 1 + int16x16 c10 = add_x16(b10, b14); + int16x16 c14 = sub_x16(b10, b14); + c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 32 96 1 0 + int16x16 c2 = add_x16(b2, b6); + int16x16 c6 = sub_x16(b2, b6); + // inv vector_reduce_ifforward 96 + // inv vector_butterfly 352 480 4 1 + int16x16 d22 = add_x16(c22, c30); + int16x16 d30 = sub_x16(c22, c30); + d30 = mulmod_scaled_x16(d30, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 288 416 4 1 + int16x16 d18 = add_x16(c18, c26); + int16x16 d26 = sub_x16(c18, c26); + d26 = mulmod_scaled_x16(d26, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 96 224 1 0 + int16x16 d6 = add_x16(c6, c14); + int16x16 d14 = sub_x16(c6, c14); + // inv vector_butterfly 32 160 1 0 + int16x16 d2 = add_x16(c2, c10); + int16x16 d10 = sub_x16(c2, c10); + // inv vector_reduce_ifreverse 288 + d18 = reduce_x16(d18, qdata); + // inv vector_reduce_ifreverse 32 + d2 = 
reduce_x16(d2, qdata); + // inv vector_butterfly 224 480 1 0 + int16x16 e14 = add_x16(d14, d30); + int16x16 e30 = sub_x16(d14, d30); + // inv vector_butterfly 96 352 1 0 + int16x16 e6 = add_x16(d6, d22); + int16x16 e22 = sub_x16(d6, d22); + // inv vector_butterfly 160 416 1 0 + int16x16 e10 = add_x16(d10, d26); + int16x16 e26 = sub_x16(d10, d26); + // inv vector_butterfly 32 288 1 0 + int16x16 e2 = add_x16(d2, d18); + int16x16 e18 = sub_x16(d2, d18); + // inv startbatch 512 + _mm256_storeu_si256((int16x16 *) (f + 32), e2); + _mm256_storeu_si256((int16x16 *) (f + 96), e6); + _mm256_storeu_si256((int16x16 *) (f + 160), e10); + _mm256_storeu_si256((int16x16 *) (f + 224), e14); + _mm256_storeu_si256((int16x16 *) (f + 288), e18); + _mm256_storeu_si256((int16x16 *) (f + 352), e22); + _mm256_storeu_si256((int16x16 *) (f + 416), e26); + _mm256_storeu_si256((int16x16 *) (f + 480), e30); + f += 512; + } + f -= 512 * reps; + // inv stopbatch 512 + for (long long r = 0; r < reps; ++r) { + // inv vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400)); + int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464)); + int16x16 b25 = _mm256_permute2x128_si256_lo(a25, a29); + int16x16 b29 = _mm256_permute2x128_si256_hi(a25, a29); + // inv vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272)); + int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336)); + int16x16 b17 = _mm256_permute2x128_si256_lo(a17, a21); + int16x16 b21 = _mm256_permute2x128_si256_hi(a17, a21); + // inv vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144)); + int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208)); + int16x16 b9 = _mm256_permute2x128_si256_lo(a9, a13); + int16x16 b13 = _mm256_permute2x128_si256_hi(a9, a13); + // inv vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); + int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); + int16x16 b1 = _mm256_permute2x128_si256_lo(a1, a5); + int16x16 b5 = _mm256_permute2x128_si256_hi(a1, a5); + // inv vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b29 = mulmod_scaled_x16(b29, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // inv vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b25 = mulmod_scaled_x16(b25, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // inv vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b21 = mulmod_scaled_x16(b21, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // inv vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b17 = mulmod_scaled_x16(b17, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // inv vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b13 = mulmod_scaled_x16(b13, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // inv 
vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b9 = mulmod_scaled_x16(b9, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // inv vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 + b5 = mulmod_scaled_x16(b5, precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); + // inv vector_reduce 16 + b1 = reduce_x16(b1, qdata); + // inv vector_butterfly 400 464 8 7 + int16x16 c25 = add_x16(b25, b29); + int16x16 c29 = sub_x16(b25, b29); + c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); + // inv vector_butterfly 272 336 8 1 + int16x16 c17 = add_x16(b17, b21); + int16x16 c21 = sub_x16(b17, b21); + c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); + // inv vector_butterfly 144 208 4 1 + int16x16 c9 = add_x16(b9, b13); + int16x16 c13 = sub_x16(b9, b13); + c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 16 80 1 0 + int16x16 c1 = add_x16(b1, b5); + int16x16 c5 = sub_x16(b1, b5); + // inv vector_reduce_ifforward 80 + // inv vector_butterfly 336 464 4 1 + int16x16 d21 = add_x16(c21, c29); + int16x16 d29 = sub_x16(c21, c29); + d29 = mulmod_scaled_x16(d29, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 272 400 4 1 + int16x16 d17 = add_x16(c17, c25); + int16x16 d25 = sub_x16(c17, c25); + d25 = mulmod_scaled_x16(d25, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 80 208 1 0 + int16x16 d5 = add_x16(c5, c13); + int16x16 d13 = sub_x16(c5, c13); + // inv vector_butterfly 16 144 1 0 + int16x16 d1 = add_x16(c1, c9); + int16x16 d9 = sub_x16(c1, c9); + // inv vector_reduce_ifreverse 272 + d17 = reduce_x16(d17, qdata); + // inv vector_reduce_ifreverse 16 + d1 = reduce_x16(d1, qdata); + // inv vector_butterfly 208 464 1 0 + int16x16 e13 = add_x16(d13, d29); + int16x16 e29 = sub_x16(d13, d29); + // inv vector_butterfly 80 336 1 0 + int16x16 e5 = add_x16(d5, d21); + int16x16 e21 = sub_x16(d5, d21); + // inv vector_butterfly 144 400 1 0 + int16x16 e9 = add_x16(d9, d25); + int16x16 e25 = sub_x16(d9, d25); + // inv vector_butterfly 16 272 1 0 + int16x16 e1 = add_x16(d1, d17); + int16x16 e17 = sub_x16(d1, d17); + // inv startbatch 512 + _mm256_storeu_si256((int16x16 *) (f + 16), e1); + _mm256_storeu_si256((int16x16 *) (f + 80), e5); + _mm256_storeu_si256((int16x16 *) (f + 144), e9); + _mm256_storeu_si256((int16x16 *) (f + 208), e13); + _mm256_storeu_si256((int16x16 *) (f + 272), e17); + _mm256_storeu_si256((int16x16 *) (f + 336), e21); + _mm256_storeu_si256((int16x16 *) (f + 400), e25); + _mm256_storeu_si256((int16x16 *) (f + 464), e29); + f += 512; + } + f -= 512 * reps; + // inv stopbatch 512 + for (long long r = 0; r < reps; ++r) { + // inv vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384)); + int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448)); + int16x16 b24 = _mm256_permute2x128_si256_lo(a24, a28); + int16x16 b28 = _mm256_permute2x128_si256_hi(a24, a28); + // inv vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256)); + int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320)); + int16x16 b16 = _mm256_permute2x128_si256_lo(a16, a20); + 
int16x16 b20 = _mm256_permute2x128_si256_hi(a16, a20); + // inv vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128)); + int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192)); + int16x16 b8 = _mm256_permute2x128_si256_lo(a8, a12); + int16x16 b12 = _mm256_permute2x128_si256_hi(a8, a12); + // inv vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi + int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); + int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); + int16x16 b0 = _mm256_permute2x128_si256_lo(a0, a4); + int16x16 b4 = _mm256_permute2x128_si256_hi(a0, a4); + // inv vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b28 = mulmod_scaled_x16(b28, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b24 = mulmod_scaled_x16(b24, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b20 = mulmod_scaled_x16(b20, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b16 = mulmod_scaled_x16(b16, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b12 = mulmod_scaled_x16(b12, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b8 = mulmod_scaled_x16(b8, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // inv vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + b4 = mulmod_scaled_x16(b4, precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); + // inv vector_reduce 0 + b0 = reduce_x16(b0, qdata); + // inv vector_butterfly 384 448 8 7 + int16x16 c24 = add_x16(b24, b28); + int16x16 c28 = sub_x16(b24, b28); + c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); + // inv vector_butterfly 256 320 8 1 + int16x16 c16 = add_x16(b16, b20); + int16x16 c20 = sub_x16(b16, b20); + c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); + // inv vector_butterfly 128 192 4 1 + int16x16 c8 = add_x16(b8, b12); + int16x16 c12 = sub_x16(b8, b12); + c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 0 64 1 0 + int16x16 c0 = add_x16(b0, b4); + int16x16 c4 = sub_x16(b0, b4); + // inv vector_reduce_ifforward 64 + // inv vector_butterfly 320 448 4 1 + int16x16 d20 = add_x16(c20, c28); + int16x16 d28 = sub_x16(c20, c28); + d28 = mulmod_scaled_x16(d28, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 256 384 4 1 + int16x16 d16 = add_x16(c16, c24); + int16x16 d24 = sub_x16(c16, c24); + d24 = mulmod_scaled_x16(d24, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); + // inv vector_butterfly 64 192 1 0 + int16x16 d4 = add_x16(c4, c12); + int16x16 d12 = sub_x16(c4, c12); + // inv vector_butterfly 0 128 1 0 + int16x16 d0 = 
add_x16(c0, c8); + int16x16 d8 = sub_x16(c0, c8); + // inv vector_reduce_ifreverse 256 + d16 = reduce_x16(d16, qdata); + // inv vector_reduce_ifreverse 0 + d0 = reduce_x16(d0, qdata); + // inv vector_butterfly 192 448 1 0 + int16x16 e12 = add_x16(d12, d28); + int16x16 e28 = sub_x16(d12, d28); + // inv vector_butterfly 64 320 1 0 + int16x16 e4 = add_x16(d4, d20); + int16x16 e20 = sub_x16(d4, d20); + // inv vector_butterfly 128 384 1 0 + int16x16 e8 = add_x16(d8, d24); + int16x16 e24 = sub_x16(d8, d24); + // inv vector_butterfly 0 256 1 0 + int16x16 e0 = add_x16(d0, d16); + int16x16 e16 = sub_x16(d0, d16); + // inv startbatch 512 + _mm256_storeu_si256((int16x16 *) (f + 0), e0); + _mm256_storeu_si256((int16x16 *) (f + 64), e4); + _mm256_storeu_si256((int16x16 *) (f + 128), e8); + _mm256_storeu_si256((int16x16 *) (f + 192), e12); + _mm256_storeu_si256((int16x16 *) (f + 256), e16); + _mm256_storeu_si256((int16x16 *) (f + 320), e20); + _mm256_storeu_si256((int16x16 *) (f + 384), e24); + _mm256_storeu_si256((int16x16 *) (f + 448), e28); + f += 512; + } + // f -= 512*reps; + // inv startntt 512 +} + +void PQCLEAN_SNTRUP761_AVX2_invntt512_7681(int16 *f, int reps) { + invntt512(f, reps, qdata_7681.data); +} + +void PQCLEAN_SNTRUP761_AVX2_invntt512_10753(int16 *f, int reps) { + invntt512(f, reps, qdata_10753.data); +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.h new file mode 100644 index 0000000000..8005ff818d --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_multsntrup761_ntt.h @@ -0,0 +1,13 @@ +#ifndef ntt_H +#define ntt_H + +#include <stdint.h> + + + +extern void PQCLEAN_SNTRUP761_AVX2_ntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP761_AVX2_ntt512_10753(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP761_AVX2_invntt512_7681(int16_t *f, int reps); +extern void PQCLEAN_SNTRUP761_AVX2_invntt512_10753(int16_t *f, int reps); + +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.c new file mode 100644 index 0000000000..477fe04169 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.c @@ -0,0 +1,47 @@ +#include "crypto_core_scale3sntrup761.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" +#include <immintrin.h> + +#define p 761 +#define q 4591 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16 + +typedef int16_t Fq; + +/* out = 3*in in Rq */ +int PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + int i = p - 16; + + __m256i save = _mm256_loadu_si256((__m256i *) (inbytes + 2 * i)); + /* in case outbytes = inbytes */ + + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) inbytes); + __m256i xneg; + x = _mm256_mullo_epi16(x, _mm256_set1_epi16(3)); + x = _mm256_sub_epi16(x, _mm256_set1_epi16((q + 1) / 2)); + xneg = _mm256_srai_epi16(x, 15); + x = _mm256_add_epi16(x, _mm256_set1_epi16(q)&xneg); + xneg = _mm256_srai_epi16(x, 15); + x = _mm256_add_epi16(x, _mm256_set1_epi16(q)&xneg); + x = _mm256_sub_epi16(x, _mm256_set1_epi16((q - 1) / 2)); + _mm256_storeu_si256((__m256i *) outbytes, x); + + inbytes += 32; + outbytes += 32; + i -= 16; + } while (i >= 0); + if (i <= -16) { + break; + } + inbytes += 2 * i; + outbytes += 2 * i; +
_mm256_storeu_si256((__m256i *) outbytes, save); + } + + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.h new file mode 100644 index 0000000000..954872f7cd --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_scale3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_SCALE3SNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_SCALE3SNTRUP761_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761_INPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_weightsntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_weightsntrup761.c new file mode 100644 index 0000000000..5d1b85e37a --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_weightsntrup761.c @@ -0,0 +1,45 @@ +#include "crypto_core_weightsntrup761.h" +#include "crypto_encode_int16.h" +#include "params.h" +#include <immintrin.h> + +#define int8 int8_t +#define int16 int16_t + + +/* out = little-endian weight of bottom bits of in */ +int PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + int8 *in = (void *) inbytes; + int i; + __m256i sum, sumhi; + int16 weight; + + sum = _mm256_loadu_si256((__m256i *) (in + p - 32)); + sum &= endingmask; + + for (i = p - 32; i >= 0; i -= 32) { + __m256i bits = _mm256_loadu_si256((__m256i *) in); + bits &= _mm256_set1_epi8(1); + sum = _mm256_add_epi8(sum, bits); + in += 32; + } + + /* sum is 32xint8; want to add these int8 */ + sumhi = _mm256_srli_epi16(sum, 8); + sum &= _mm256_set1_epi16(0xff); + sum = _mm256_add_epi16(sum, sumhi); + + /* sum is 16xint16; want to add these int16 */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[1]+sum[2]+sum[3]+sum[8]+sum[9]+sum[10]+sum[11] */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[1]+sum[8]+sum[9] */ + sum = _mm256_hadd_epi16(sum, sum); + /* want sum[0]+sum[8] */ + + weight = (int16) _mm256_extract_epi16(sum, 0); + weight += (int16) _mm256_extract_epi16(sum, 8); + + PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16(outbytes, &weight); + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_weightsntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_weightsntrup761.h new file mode 100644 index 0000000000..a2e3cd44b6 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_weightsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_WEIGHTSNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_WEIGHTSNTRUP761_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761_OUTPUTBYTES 2 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.c
b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.c new file mode 100644 index 0000000000..0dedf0af99 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.c @@ -0,0 +1,61 @@ +#include "crypto_core_wforcesntrup761.h" +#include "crypto_decode_int16.h" +#include "params.h" +#include <immintrin.h> + +#define int16 int16_t + + +/* out = in if bottom bits of in have weight w */ +/* otherwise out = (1,1,...,1,0,0,...,0) */ +int PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761(unsigned char *out, const unsigned char *in) { + int16 weight; + int16 mask; + __m256i maskvec; + int i; + + crypto_core_weight((unsigned char *) &weight, in); + PQCLEAN_SNTRUP761_AVX2_crypto_decode_int16(&weight, (unsigned char *) &weight); + + mask = (int16) ((weight - w) | (w - weight)); + mask >>= 15; + maskvec = _mm256_set1_epi16((short) ~mask); + + i = w - 32; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) in); + x ^= _mm256_set1_epi8(1); + x &= maskvec; + x ^= _mm256_set1_epi8(1); + _mm256_storeu_si256((__m256i *) out, x); + in += 32; + out += 32; + i -= 32; + } while (i >= 0); + if (i <= -32) { + break; + } + in += i; + out += i; + } + + i = p - w - 32; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) in); + x &= maskvec; + _mm256_storeu_si256((__m256i *) out, x); + in += 32; + out += 32; + i -= 32; + } while (i >= 0); + if (i <= -32) { + break; + } + in += i; + out += i; + } + + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.h new file mode 100644 index 0000000000..78876a5409 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_core_wforcesntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_WFORCESNTRUP761_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_CORE_WFORCESNTRUP761_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761_OUTPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761(unsigned char *out, const unsigned char *in); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_declassify.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_declassify.h new file mode 100644 index 0000000000..a67a915c51 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_declassify.h @@ -0,0 +1,8 @@ +#ifndef crypto_declassify_h +#define crypto_declassify_h + + + +#define crypto_declassify(BUF,SIZE) + +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x1531.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x1531.c new file mode 100644 index 0000000000..aa567462c9 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x1531.c @@ -0,0 +1,432 @@ +#include "crypto_decode_761x1531.h" +#include <immintrin.h> +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (int16) ((x * (int32)y) >> 16); +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) {
return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 = mulhi(a1, -84) - mulhi(mullo(a1, -4828), 3475); + a1 += *--s; /* -1738...1992 */ + a1 += (a1 >> 15) & 3475; /* 0...3474 */ + R10[0] = a1; + + /* R10 ------> R9: reconstruct mod 1*[593]+[1500] */ + + i = 0; + s -= 1; + a2 = a0 = R10[0]; + a0 = mulhi(a0, 60) - mulhi(mullo(a0, -28292), 593); /* -297...311 */ + a0 += s[1 * i + 0]; /* -297...566 */ + a0 += (a0 >> 15) & 593; /* 0...592 */ + a1 = (int16) ((a2 << 8) + s[i] - a0); + a1 = mullo(a1, -31055); + + /* invalid inputs might need reduction mod 1500 */ + a1 -= 1500; + a1 += (a1 >> 15) & 1500; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 2*[6232]+[1500] */ + + R8[2] = R9[1]; + s -= 2; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, 672) - mulhi(mullo(a0, -2692), 6232); /* -3116...3284 */ + a0 += s[2 * i + 1]; /* -3116...3539 */ + a0 = mulhi(a0, 672) - mulhi(mullo(a0, -2692), 6232); /* -3148...3152 */ + a0 += s[2 * i + 0]; /* -3148...3407 */ + a0 += (a0 >> 15) & 6232; /* 0...6231 */ + a1 = (int16) ((a2 << 13) + (s[2 * i + 1] << 5) + ((s[2 * i] - a0) >> 3)); + a1 = mullo(a1, 12451); + + /* invalid inputs might need reduction mod 6232 */ + a1 -= 6232; + a1 += (a1 >> 15) & 6232; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 5*[1263]+[304] */ + + i = 0; + s -= 1; + a2 = a0 = R8[2]; + a0 = mulhi(a0, -476) - mulhi(mullo(a0, -13284), 1263); /* -751...631 */ + a0 += s[1 * i + 0]; /* -751...886 */ + a0 += (a0 >> 15) & 1263; /* 0...1262 */ + a1 = (int16) ((a2 << 8) + s[i] - a0); + a1 = mullo(a1, -22001); + + /* invalid inputs might need reduction mod 304 */ + a1 -= 304; + a1 += (a1 >> 15) & 304; + + R7[4] = a0; + R7[5] = a1; + s -= 2; + for (i = 1; i >= 0; --i) { + a2 = a0 = R8[i]; + a0 = mulhi(a0, -476) - mulhi(mullo(a0, -13284), 1263); /* -751...631 */ + a0 += s[1 * i + 0]; /* -751...886 */ + a0 += (a0 >> 15) & 1263; /* 0...1262 */ + a1 = (int16) ((a2 << 8) + s[i] - a0); + a1 = mullo(a1, -22001); + + /* invalid inputs might need reduction mod 1263 */ + a1 -= 1263; + a1 += (a1 >> 15) & 1263; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 11*[9097]+[2188] */ + + i = 0; + s -= 2; + a0 = R7[5]; + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4549...5135 */ + a0 += s[2 * i + 1]; /* -4549...5390 */ + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 
9097); /* -4712...4741 */ + a0 += s[2 * i + 0]; /* -4712...4996 */ + a0 += (a0 >> 15) & 9097; /* 0...9096 */ + a1 = (int16) ((s[2 * i + 1] << 8) + s[2 * i] - a0); + a1 = mullo(a1, 17081); + + /* invalid inputs might need reduction mod 2188 */ + a1 -= 2188; + a1 += (a1 >> 15) & 2188; + + R6[10] = a0; + R6[11] = a1; + s -= 10; + for (i = 4; i >= 0; --i) { + a0 = R7[i]; + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4549...5135 */ + a0 += s[2 * i + 1]; /* -4549...5390 */ + a0 = mulhi(a0, 2348) - mulhi(mullo(a0, -1844), 9097); /* -4712...4741 */ + a0 += s[2 * i + 0]; /* -4712...4996 */ + a0 += (a0 >> 15) & 9097; /* 0...9096 */ + a1 = (int16) ((s[2 * i + 1] << 8) + s[2 * i] - a0); + a1 = mullo(a1, 17081); + + /* invalid inputs might need reduction mod 9097 */ + a1 -= 9097; + a1 += (a1 >> 15) & 9097; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 23*[1526]+[367] */ + + i = 0; + s -= 1; + a2 = a0 = R6[11]; + a0 = mulhi(a0, 372) - mulhi(mullo(a0, -10994), 1526); /* -763...856 */ + a0 += s[1 * i + 0]; /* -763...1111 */ + a0 += (a0 >> 15) & 1526; /* 0...1525 */ + a1 = (int16) ((a2 << 7) + ((s[i] - a0) >> 1)); + a1 = mullo(a1, -18381); + + /* invalid inputs might need reduction mod 367 */ + a1 -= 367; + a1 += (a1 >> 15) & 367; + + R5[22] = a0; + R5[23] = a1; + s -= 11; + for (i = 10; i >= 0; --i) { + a2 = a0 = R6[i]; + a0 = mulhi(a0, 372) - mulhi(mullo(a0, -10994), 1526); /* -763...856 */ + a0 += s[1 * i + 0]; /* -763...1111 */ + a0 += (a0 >> 15) & 1526; /* 0...1525 */ + a1 = (int16) ((a2 << 7) + ((s[i] - a0) >> 1)); + a1 = mullo(a1, -18381); + + /* invalid inputs might need reduction mod 1526 */ + a1 -= 1526; + a1 += (a1 >> 15) & 1526; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 47*[625]+[150] */ + + i = 0; + s -= 1; + a2 = a0 = R5[23]; + a0 = mulhi(a0, -284) - mulhi(mullo(a0, -26844), 625); /* -384...312 */ + a0 += s[1 * i + 0]; /* -384...567 */ + a0 += (a0 >> 15) & 625; /* 0...624 */ + a1 = (int16) ((a2 << 8) + s[i] - a0); + a1 = mullo(a1, 32401); + + /* invalid inputs might need reduction mod 150 */ + a1 -= 150; + a1 += (a1 >> 15) & 150; + + R4[46] = a0; + R4[47] = a1; + s -= 23; + i = 7; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -284), mulhiconst(mulloconst(A0, -26844), 625)); /* -384...312 */ + A0 = add(A0, S0); /* -384...567 */ + A0 = ifnegaddconst(A0, 625); /* 0...624 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 32401); + + /* invalid inputs might need reduction mod 625 */ + A1 = ifgesubconst(A1, 625); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 95*[6400]+[1531] */ + + i = 0; + s -= 2; + a2 = a0 = R4[47]; + a0 = mulhi(a0, 2816) - mulhi(mullo(a0, -2621), 6400); /* -3200...3904 */ + a0 += s[2 * i + 1]; 
/* -3200...4159 */ + a0 = mulhi(a0, 2816) - mulhi(mullo(a0, -2621), 6400); /* -3338...3378 */ + a0 += s[2 * i + 0]; /* -3338...3633 */ + a0 += (a0 >> 15) & 6400; /* 0...6399 */ + a1 = (int16) ((a2 << 8) + s[2 * i + 1] + ((s[2 * i] - a0) >> 8)); + a1 = mullo(a1, 23593); + + /* invalid inputs might need reduction mod 1531 */ + a1 -= 1531; + a1 += (a1 >> 15) & 1531; + + R3[94] = a0; + R3[95] = a1; + s -= 94; + i = 31; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 2816), mulhiconst(mulloconst(A0, -2621), 6400)); /* -3200...3904 */ + A0 = add(A0, S1); /* -3200...4159 */ + A0 = sub(mulhiconst(A0, 2816), mulhiconst(mulloconst(A0, -2621), 6400)); /* -3338...3378 */ + A0 = add(A0, S0); /* -3338...3633 */ + A0 = ifnegaddconst(A0, 6400); /* 0...6399 */ + A1 = add(add(shiftleftconst(A2, 8), S1), signedshiftrightconst(sub(S0, A0), 8)); + A1 = mulloconst(A1, 23593); + + /* invalid inputs might need reduction mod 6400 */ + A1 = ifgesubconst(A1, 6400); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 190*[1280]+[1531] */ + + R2[190] = R3[95]; + s -= 95; + i = 79; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 256), mulhiconst(mulloconst(A0, -13107), 1280)); /* -640...704 */ + A0 = add(A0, S0); /* -640...959 */ + A0 = ifnegaddconst(A0, 1280); /* 0...1279 */ + A1 = add(A2, signedshiftrightconst(sub(S0, A0), 8)); + A1 = mulloconst(A1, -13107); + + /* invalid inputs might need reduction mod 1280 */ + A1 = ifgesubconst(A1, 1280); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 380*[9157]+[1531] */ + + R1[380] = R2[190]; + s -= 380; + i = 174; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 1592), mulhiconst(mulloconst(A0, -1832), 9157)); /* -4579...4976 */ + A0 = add(A0, S1); /* -4579...5231 */ + A0 = sub(mulhiconst(A0, 1592), mulhiconst(mulloconst(A0, -1832), 9157)); /* 
-4690...4705 */ + A0 = add(A0, S0); /* -4690...4960 */ + A0 = ifnegaddconst(A0, 9157); /* 0...9156 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, 25357); + + /* invalid inputs might need reduction mod 9157 */ + A1 = ifgesubconst(A1, 9157); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 761*[1531] */ + + R0[760] = (int16) (3 * R1[380] - 2295); + s -= 380; + i = 364; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 518), mulhiconst(mulloconst(A0, -10958), 1531)); /* -766...895 */ + A0 = add(A0, S0); /* -766...1150 */ + A0 = ifnegaddconst(A0, 1531); /* 0...1530 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, 15667); + + /* invalid inputs might need reduction mod 1531 */ + A1 = ifgesubconst(A1, 1531); + + A0 = mulloconst(A0, 3); + A1 = mulloconst(A1, 3); + A0 = subconst(A0, 2295); + A1 = subconst(A1, 2295); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x1531.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x1531.h new file mode 100644 index 0000000000..02ee10a81d --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X1531_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X1531_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x3.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x3.c new file mode 100644 index 0000000000..73b0f0928c --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x3.c @@ -0,0 +1,65 @@ +#include "crypto_decode_761x3.h" +#include <immintrin.h> +#define uint8 uint8_t + +#define p 761 +#define loops 6 +#define overshoot 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3(void *v, const unsigned char *s) {
uint8 *f = v; + int loop; + uint8 *nextf = f + 128 - 4 * overshoot; + const unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i s0 = _mm256_loadu_si256((const __m256i *) s); + s = nexts; + nexts += 32; + + __m256i s1 = _mm256_srli_epi16(s0 & _mm256_set1_epi8(-16), 4); + s0 &= _mm256_set1_epi8(15); + + __m256i a0 = _mm256_unpacklo_epi8(s0, s1); + /* 0 0>>4 1 1>>4 2 2>>4 3 3>>4 4 4>>4 5 5>>4 6 6>>4 7 7>>4 */ + /* 16 16>>4 ... */ + __m256i a1 = _mm256_unpackhi_epi8(s0, s1); + /* 8 8>>4 9 9>>4 10 10>>4 ... */ + /* 24 24>>4 ... */ + + __m256i a2 = _mm256_srli_epi16(a0 & _mm256_set1_epi8(12), 2); + __m256i a3 = _mm256_srli_epi16(a1 & _mm256_set1_epi8(12), 2); + a0 &= _mm256_set1_epi8(3); + a1 &= _mm256_set1_epi8(3); + + __m256i b0 = _mm256_unpacklo_epi8(a0, a2); + /* 0 0>>2 0>>4 0>>6 1 1>>2 1>>4 1>>6 */ + /* 2 2>>2 2>>4 2>>6 3 3>>2 3>>4 3>>6 */ + /* 16 16>>2 16>>4 16>>6 ... */ + __m256i b2 = _mm256_unpackhi_epi8(a0, a2); + /* 4 4>>2 ... */ + __m256i b1 = _mm256_unpacklo_epi8(a1, a3); + /* 8 8>>2 ... */ + __m256i b3 = _mm256_unpackhi_epi8(a1, a3); + /* 12 12>>2 ... */ + + __m256i f0 = _mm256_permute2x128_si256(b0, b2, 0x20); + __m256i f2 = _mm256_permute2x128_si256(b0, b2, 0x31); + __m256i f1 = _mm256_permute2x128_si256(b1, b3, 0x20); + __m256i f3 = _mm256_permute2x128_si256(b1, b3, 0x31); + + f0 = _mm256_add_epi8(f0, _mm256_set1_epi8(-1)); + f1 = _mm256_add_epi8(f1, _mm256_set1_epi8(-1)); + f2 = _mm256_add_epi8(f2, _mm256_set1_epi8(-1)); + f3 = _mm256_add_epi8(f3, _mm256_set1_epi8(-1)); + + _mm256_storeu_si256((__m256i *) (f + 0), f0); + _mm256_storeu_si256((__m256i *) (f + 32), f1); + _mm256_storeu_si256((__m256i *) (f + 64), f2); + _mm256_storeu_si256((__m256i *) (f + 96), f3); + f = nextf; + nextf += 128; + } + + *f = (uint8) ((*s & 3) - 1); +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x3.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x3.h new file mode 100644 index 0000000000..f72e26ade6 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X3_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X3_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3_STRBYTES 191 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x4591.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x4591.c new file mode 100644 index 0000000000..ea31ac7982 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x4591.c @@ -0,0 +1,432 @@ +#include "crypto_decode_761x4591.h" +#include <immintrin.h> +/* auto-generated; do not edit */ + +#define int16 int16_t +#define int32 int32_t + +static inline int16 mullo(int16 x, int16 y) { + return x * y; +} + +static inline int16 mulhi(int16 x, int16 y) { + return (int16) ((x * (int32)y) >> 16); +} + +static inline __m256i add(__m256i x, __m256i y) { + return _mm256_add_epi16(x, y); +} + +static inline __m256i sub(__m256i x, __m256i y) { + return _mm256_sub_epi16(x, y); +} + +static inline __m256i shiftleftconst(__m256i x, int16 y) { + return _mm256_slli_epi16(x, y); +} + +static inline __m256i signedshiftrightconst(__m256i x, int16 y) { + return _mm256_srai_epi16(x, y); +} + +static inline __m256i subconst(__m256i x, int16 y) { + return
sub(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulloconst(__m256i x, int16 y) { + return _mm256_mullo_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i mulhiconst(__m256i x, int16 y) { + return _mm256_mulhi_epi16(x, _mm256_set1_epi16(y)); +} + +static inline __m256i ifgesubconst(__m256i x, int16 y) { + __m256i y16 = _mm256_set1_epi16(y); + __m256i top16 = _mm256_set1_epi16((int16)(y - 1)); + return sub(x, _mm256_cmpgt_epi16(x, top16) & y16); +} + +static inline __m256i ifnegaddconst(__m256i x, int16 y) { + return add(x, signedshiftrightconst(x, 15) & _mm256_set1_epi16(y)); +} + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591(void *v, const unsigned char *s) { + int16 *R0 = v; + int16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + int16 a0, a1, a2; + __m256i A0, A1, A2, S0, S1, B0, B1, C0, C1; + + s += PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591_STRBYTES; + a1 = 0; + a1 += *--s; /* 0...255 */ + a1 = mulhi(a1, -656) - mulhi(mullo(a1, -10434), 1608); + a1 += *--s; /* -804...1056 */ + a1 += (a1 >> 15) & 1608; /* 0...1607 */ + R10[0] = a1; + + /* R10 ------> R9: reconstruct mod 1*[9470]+[11127] */ + + i = 0; + s -= 2; + a2 = a0 = R10[0]; + a0 = mulhi(a0, -3624) - mulhi(mullo(a0, -1772), 9470); /* -5641...4735 */ + a0 += s[2 * i + 1]; /* -5641...4990 */ + a0 = mulhi(a0, -3624) - mulhi(mullo(a0, -1772), 9470); /* -5011...5046 */ + a0 += s[2 * i + 0]; /* -5011...5301 */ + a0 += (a0 >> 15) & 9470; /* 0...9469 */ + a1 = (int16) ((a2 << 15) + (s[2 * i + 1] << 7) + ((s[2 * i] - a0) >> 1)); + a1 = mullo(a1, -21121); + + /* invalid inputs might need reduction mod 11127 */ + a1 -= 11127; + a1 += (a1 >> 15) & 11127; + + R9[0] = a0; + R9[1] = a1; + s -= 0; + + /* R9 ------> R8: reconstruct mod 2*[1557]+[11127] */ + + R8[2] = R9[1]; + s -= 1; + for (i = 0; i >= 0; --i) { + a2 = a0 = R9[i]; + a0 = mulhi(a0, 541) - mulhi(mullo(a0, -10775), 1557); /* -779...913 */ + a0 += s[1 * i + 0]; /* -779...1168 */ + a0 += (a0 >> 15) & 1557; /* 0...1556 */ + a1 = (int16) ((a2 << 8) + s[i] - a0); + a1 = mullo(a1, -26307); + + /* invalid inputs might need reduction mod 1557 */ + a1 -= 1557; + a1 += (a1 >> 15) & 1557; + + R8[2 * i] = a0; + R8[2 * i + 1] = a1; + } + + /* R8 ------> R7: reconstruct mod 5*[10101]+[282] */ + + i = 0; + s -= 1; + a2 = a0 = R8[2]; + a0 = mulhi(a0, -545) - mulhi(mullo(a0, -1661), 10101); /* -5187...5050 */ + a0 += s[1 * i + 0]; /* -5187...5305 */ + a0 += (a0 >> 15) & 10101; /* 0...10100 */ + a1 = (int16) ((a2 << 8) + s[i] - a0); + a1 = mullo(a1, 12509); + + /* invalid inputs might need reduction mod 282 */ + a1 -= 282; + a1 += (a1 >> 15) & 282; + + R7[4] = a0; + R7[5] = a1; + s -= 4; + for (i = 1; i >= 0; --i) { + a0 = R8[i]; + a0 = mulhi(a0, -545) - mulhi(mullo(a0, -1661), 10101); /* -5187...5050 */ + a0 += s[2 * i + 1]; /* -5187...5305 */ + a0 = mulhi(a0, -545) - mulhi(mullo(a0, -1661), 10101); /* -5095...5093 */ + a0 += s[2 * i + 0]; /* -5095...5348 */ + a0 += (a0 >> 15) & 10101; /* 0...10100 */ + a1 = (int16) ((s[2 * i + 1] << 8) + s[2 * i] - a0); + a1 = mullo(a1, 12509); + + /* invalid inputs might need reduction mod 10101 */ + a1 -= 10101; + a1 += (a1 >> 15) & 10101; + + R7[2 * i] = a0; + R7[2 * i + 1] = a1; + } + + /* R7 ------> R6: reconstruct mod 11*[1608]+[11468] */ + + i = 0; + s -= 2; + a2 = a0 = R7[5]; + a0 = mulhi(a0, -656) - mulhi(mullo(a0, -10434), 1608); /* -968...804 */ + a0 += s[2 * i + 1]; /* -968...1059 */ + a0 = mulhi(a0, -656) - mulhi(mullo(a0, -10434), 1608); /* -815...813 */ + a0 += s[2 * i + 0]; /* 
-815...1068 */ + a0 += (a0 >> 15) & 1608; /* 0...1607 */ + a1 = (int16) ((a2 << 13) + (s[2 * i + 1] << 5) + ((s[2 * i] - a0) >> 3)); + a1 = mullo(a1, 6521); + + /* invalid inputs might need reduction mod 11468 */ + a1 -= 11468; + a1 += (a1 >> 15) & 11468; + + R6[10] = a0; + R6[11] = a1; + s -= 5; + for (i = 4; i >= 0; --i) { + a2 = a0 = R7[i]; + a0 = mulhi(a0, -656) - mulhi(mullo(a0, -10434), 1608); /* -968...804 */ + a0 += s[1 * i + 0]; /* -968...1059 */ + a0 += (a0 >> 15) & 1608; /* 0...1607 */ + a1 = (int16) ((a2 << 5) + ((s[i] - a0) >> 3)); + a1 = mullo(a1, 6521); + + /* invalid inputs might need reduction mod 1608 */ + a1 -= 1608; + a1 += (a1 >> 15) & 1608; + + R6[2 * i] = a0; + R6[2 * i + 1] = a1; + } + + /* R6 ------> R5: reconstruct mod 23*[10265]+[286] */ + + i = 0; + s -= 1; + a2 = a0 = R6[11]; + a0 = mulhi(a0, 4206) - mulhi(mullo(a0, -1634), 10265); /* -5133...6184 */ + a0 += s[1 * i + 0]; /* -5133...6439 */ + a0 += (a0 >> 15) & 10265; /* 0...10264 */ + a1 = (int16) ((a2 << 8) + s[i] - a0); + a1 = mullo(a1, -19415); + + /* invalid inputs might need reduction mod 286 */ + a1 -= 286; + a1 += (a1 >> 15) & 286; + + R5[22] = a0; + R5[23] = a1; + s -= 22; + for (i = 10; i >= 0; --i) { + a0 = R6[i]; + a0 = mulhi(a0, 4206) - mulhi(mullo(a0, -1634), 10265); /* -5133...6184 */ + a0 += s[2 * i + 1]; /* -5133...6439 */ + a0 = mulhi(a0, 4206) - mulhi(mullo(a0, -1634), 10265); /* -5462...5545 */ + a0 += s[2 * i + 0]; /* -5462...5800 */ + a0 += (a0 >> 15) & 10265; /* 0...10264 */ + a1 = (int16) ((s[2 * i + 1] << 8) + s[2 * i] - a0); + a1 = mullo(a1, -19415); + + /* invalid inputs might need reduction mod 10265 */ + a1 -= 10265; + a1 += (a1 >> 15) & 10265; + + R5[2 * i] = a0; + R5[2 * i + 1] = a1; + } + + /* R5 ------> R4: reconstruct mod 47*[1621]+[11550] */ + + i = 0; + s -= 2; + a0 = R5[23]; + a0 = mulhi(a0, -134) - mulhi(mullo(a0, -10350), 1621); /* -844...810 */ + a0 += s[2 * i + 1]; /* -844...1065 */ + a0 = mulhi(a0, -134) - mulhi(mullo(a0, -10350), 1621); /* -813...812 */ + a0 += s[2 * i + 0]; /* -813...1067 */ + a0 += (a0 >> 15) & 1621; /* 0...1620 */ + a1 = (int16) ((s[2 * i + 1] << 8) + s[2 * i] - a0); + a1 = mullo(a1, -14595); + + /* invalid inputs might need reduction mod 11550 */ + a1 -= 11550; + a1 += (a1 >> 15) & 11550; + + R4[46] = a0; + R4[47] = a1; + s -= 23; + i = 7; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R5[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -134), mulhiconst(mulloconst(A0, -10350), 1621)); /* -844...810 */ + A0 = add(A0, S0); /* -844...1065 */ + A0 = ifnegaddconst(A0, 1621); /* 0...1620 */ + A1 = add(shiftleftconst(A2, 8), sub(S0, A0)); + A1 = mulloconst(A1, -14595); + + /* invalid inputs might need reduction mod 1621 */ + A1 = ifgesubconst(A1, 1621); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R4[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R4[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R4 ------> R3: reconstruct mod 95*[644]+[4591] */ + + i = 0; + s -= 
1; + a2 = a0 = R4[47]; + a0 = mulhi(a0, -272) - mulhi(mullo(a0, -26052), 644); /* -390...322 */ + a0 += s[1 * i + 0]; /* -390...577 */ + a0 += (a0 >> 15) & 644; /* 0...643 */ + a1 = (int16) ((a2 << 6) + ((s[i] - a0) >> 2)); + a1 = mullo(a1, -7327); + + /* invalid inputs might need reduction mod 4591 */ + a1 -= 4591; + a1 += (a1 >> 15) & 4591; + + R3[94] = a0; + R3[95] = a1; + s -= 47; + i = 31; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R4[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, -272), mulhiconst(mulloconst(A0, -26052), 644)); /* -390...322 */ + A0 = add(A0, S0); /* -390...577 */ + A0 = ifnegaddconst(A0, 644); /* 0...643 */ + A1 = add(shiftleftconst(A2, 6), signedshiftrightconst(sub(S0, A0), 2)); + A1 = mulloconst(A1, -7327); + + /* invalid inputs might need reduction mod 644 */ + A1 = ifgesubconst(A1, 644); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R3[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R3[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R3 ------> R2: reconstruct mod 190*[406]+[4591] */ + + R2[190] = R3[95]; + s -= 95; + i = 79; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R3[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 78), mulhiconst(mulloconst(A0, 24213), 406)); /* -203...222 */ + A0 = add(A0, S0); /* -203...477 */ + A0 = subconst(A0, 406); /* -609...71 */ + A0 = ifnegaddconst(A0, 406); /* -203...405 */ + A0 = ifnegaddconst(A0, 406); /* 0...405 */ + A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); + A1 = mulloconst(A1, 25827); + + /* invalid inputs might need reduction mod 406 */ + A1 = ifgesubconst(A1, 406); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R2[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R2[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R2 ------> R1: reconstruct mod 380*[322]+[4591] */ + + R1[380] = R2[190]; + s -= 190; + i = 174; + for (;;) { + A2 = A0 = _mm256_loadu_si256((__m256i *) &R2[i]); + S0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *) (s + i))); + A0 = sub(mulhiconst(A0, 50), mulhiconst(mulloconst(A0, 13433), 322)); /* -161...173 */ + A0 = add(A0, S0); /* -161...428 */ + A0 = subconst(A0, 322); /* -483...106 */ + A0 = ifnegaddconst(A0, 322); /* -161...321 */ + A0 = ifnegaddconst(A0, 322); /* 0...321 */ + A1 = add(shiftleftconst(A2, 7), signedshiftrightconst(sub(S0, A0), 1)); + A1 = mulloconst(A1, -7327); + + /* invalid 
inputs might need reduction mod 322 */ + A1 = ifgesubconst(A1, 322); + + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R1[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R1[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } + + /* R1 ------> R0: reconstruct mod 761*[4591] */ + + R0[760] = R1[380] - 2295; + s -= 760; + i = 364; + for (;;) { + A0 = _mm256_loadu_si256((__m256i *) &R1[i]); + S0 = _mm256_loadu_si256((__m256i *) (s + 2 * i)); + S1 = _mm256_srli_epi16(S0, 8); + S0 &= _mm256_set1_epi16(255); + A0 = sub(mulhiconst(A0, 1702), mulhiconst(mulloconst(A0, -3654), 4591)); /* -2296...2721 */ + A0 = add(A0, S1); /* -2296...2976 */ + A0 = sub(mulhiconst(A0, 1702), mulhiconst(mulloconst(A0, -3654), 4591)); /* -2356...2372 */ + A0 = add(A0, S0); /* -2356...2627 */ + A0 = ifnegaddconst(A0, 4591); /* 0...4590 */ + A1 = add(shiftleftconst(S1, 8), sub(S0, A0)); + A1 = mulloconst(A1, 15631); + + /* invalid inputs might need reduction mod 4591 */ + A1 = ifgesubconst(A1, 4591); + + A0 = subconst(A0, 2295); + A1 = subconst(A1, 2295); + /* A0: r0r2r4r6r8r10r12r14 r16r18r20r22r24r26r28r30 */ + /* A1: r1r3r5r7r9r11r13r15 r17r19r21r23r25r27r29r31 */ + B0 = _mm256_unpacklo_epi16(A0, A1); + B1 = _mm256_unpackhi_epi16(A0, A1); + /* B0: r0r1r2r3r4r5r6r7 r16r17r18r19r20r21r22r23 */ + /* B1: r8r9r10r11r12r13r14r15 r24r25r26r27r28r29r30r31 */ + C0 = _mm256_permute2x128_si256(B0, B1, 0x20); + C1 = _mm256_permute2x128_si256(B0, B1, 0x31); + /* C0: r0r1r2r3r4r5r6r7 r8r9r10r11r12r13r14r15 */ + /* C1: r16r17r18r19r20r21r22r23 r24r25r26r27r28r29r30r31 */ + _mm256_storeu_si256((__m256i *) (&R0[2 * i]), C0); + _mm256_storeu_si256((__m256i *) (16 + &R0[2 * i]), C1); + if (!i) { + break; + } + i = -16 - ((~15) & -i); + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x4591.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x4591.h new file mode 100644 index 0000000000..6b63781438 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761x4591.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X4591_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761X4591_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591_STRBYTES 1158 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint16.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint16.c new file mode 100644 index 0000000000..68210a2cb5 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint16.c @@ -0,0 +1,15 @@ +#include "crypto_decode_761xint16.h" + + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + *x = (uint16_t) (u0 | (u1 << 8)); + x += 1;
+ s += 2; + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint16.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint16.h new file mode 100644 index 0000000000..5d8aa8ea17 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761XINT16_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761XINT16_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16_STRBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint32.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint32.c new file mode 100644 index 0000000000..fb5210b14c --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_761xint32.h" + + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint32.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint32.h new file mode 100644 index 0000000000..c19b23a2ab --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_761xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761XINT32_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_761XINT32_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32_STRBYTES 3044 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32_ITEMBYTES 4 + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_int16.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_int16.c new file mode 100644 index 0000000000..3994a7e513 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_decode_int16.h" + + +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_int16(void *x, const unsigned char *s) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + u1 = (uint16_t) (u1 << 8); + *(uint16_t *) x = u0 | u1; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_int16.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_int16.h new file mode 100644 index 0000000000..fcc8b07bbe --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_decode_int16.h @@ -0,0 +1,9 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_INT16_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_DECODE_INT16_H + +#include <stdint.h> +#define crypto_core_multsntrup1277_STRBYTES 2 +#define crypto_core_multsntrup1277_ITEMS 1 +#define crypto_core_multsntrup1277_ITEMBYTES 2 +void PQCLEAN_SNTRUP761_AVX2_crypto_decode_int16(void *x, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531.c new file mode 100644 index 0000000000..c293a1b10a --- /dev/null +++
b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531.c @@ -0,0 +1,301 @@ +#include "crypto_encode_761x1531.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 48; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2295)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1531)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + R[380] = (uint16) ((((R0[760] + 2295) & 16383) * 10923) >> 15); + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9157)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9157)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[190] = R[380]; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1280)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = 
_mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + R[95] = R[190]; + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6400)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6400)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(625)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1526)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned 
char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + + r0 = R[0]; + *out++ = (unsigned char) r0; + r0 >>= 8; + *out++ = (unsigned char) r0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531.h new file mode 100644 index 0000000000..c4a2a75373 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X1531_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X1531_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531(unsigned char *out, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531round.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531round.c new file mode 100644 index 0000000000..7aa27f4208 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531round.c @@ -0,0 +1,303 @@ +#include "crypto_encode_761x1531round.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 48; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_add_epi16(x, _mm256_add_epi16(x, x)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2295)); + x &= _mm256_set1_epi16(16383); + x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846)); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1531)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 
>>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + R[380] = (uint16) ((((3 * ((10923 * R0[760] + 16384) >> 15) + 2295) & 16383) * 10923) >> 15); + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(9157)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(9157)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[190] = R[380]; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1280)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + R[95] = R[190]; + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(6400)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(6400)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = 
_mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(625)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 2; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 4; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1526)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + + r0 = R[0]; + *out++ = (unsigned char) r0; + r0 >>= 8; + *out++ = (unsigned char) r0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531round.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531round.h new file mode 100644 index 0000000000..b3c29ef9eb --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x1531round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X1531ROUND_H +#define 
PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X1531ROUND_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round(unsigned char *out, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x3.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x3.c new file mode 100644 index 0000000000..d7442199b5 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x3.c @@ -0,0 +1,64 @@ +#include "crypto_encode_761x3.h" +#include +#define uint8 uint8_t + +#define p 761 +#define loops 6 +#define overshoot 2 + +static const union { + uint8 init[32]; + __m256i val; +} lobytes_buf = { .init = { + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0, + } +}; +#define lobytes (lobytes_buf.val) + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3(unsigned char *s, const void *v) { + const uint8 *f = v; + int loop; + const uint8 *nextf = f + 128 - 4 * overshoot; + unsigned char *nexts = s + 32 - overshoot; + + for (loop = loops; loop > 0; --loop) { + __m256i f0 = _mm256_loadu_si256((const __m256i *) (f + 0)); + __m256i f1 = _mm256_loadu_si256((const __m256i *) (f + 32)); + __m256i f2 = _mm256_loadu_si256((const __m256i *) (f + 64)); + __m256i f3 = _mm256_loadu_si256((const __m256i *) (f + 96)); + f = nextf; + nextf += 128; + + __m256i a0 = _mm256_packus_epi16(f0 & lobytes, f1 & lobytes); + /* 0 2 4 6 8 10 12 14 32 34 36 38 40 42 44 46 */ + /* 16 18 20 22 24 26 28 30 48 50 52 54 56 58 60 62 */ + __m256i a1 = _mm256_packus_epi16(_mm256_srli_epi16(f0, 8), _mm256_srli_epi16(f1, 8)); + /* 1 3 ... */ + __m256i a2 = _mm256_packus_epi16(f2 & lobytes, f3 & lobytes); + __m256i a3 = _mm256_packus_epi16(_mm256_srli_epi16(f2, 8), _mm256_srli_epi16(f3, 8)); + + a0 = _mm256_add_epi8(a0, _mm256_slli_epi16(a1 & _mm256_set1_epi8(63), 2)); + a2 = _mm256_add_epi8(a2, _mm256_slli_epi16(a3 & _mm256_set1_epi8(63), 2)); + + __m256i b0 = _mm256_packus_epi16(a0 & lobytes, a2 & lobytes); + /* 0 4 8 12 32 36 40 44 64 68 72 76 96 100 104 108 */ + /* 16 20 24 28 48 52 56 60 80 84 88 92 112 116 120 124 */ + __m256i b2 = _mm256_packus_epi16(_mm256_srli_epi16(a0, 8), _mm256_srli_epi16(a2, 8)); + /* 2 6 ... 
*/ + + b0 = _mm256_add_epi8(b0, _mm256_slli_epi16(b2 & _mm256_set1_epi8(15), 4)); + + b0 = _mm256_permutevar8x32_epi32(b0, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + b0 = _mm256_add_epi8(b0, _mm256_set1_epi8(85)); + + _mm256_storeu_si256((__m256i *) s, b0); + s = nexts; + nexts += 32; + } + + *s++ = *f++ + 1; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x3.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x3.h new file mode 100644 index 0000000000..e99384fbf4 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X3_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X3_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3_STRBYTES 191 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3(unsigned char *s, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x4591.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x4591.c new file mode 100644 index 0000000000..128cd4bbcf --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x4591.c @@ -0,0 +1,308 @@ +#include "crypto_encode_761x4591.h" +#include +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + const uint16 *reading; + uint16 *writing; + uint16 r0, r1; + uint32 r2; + uint32 s0; + + reading = (uint16 *) R0; + writing = R; + i = 24; + while (i > 0) { + __m256i x, x2, y, y2; + --i; + if (!i) { + reading -= 8; + writing -= 4; + out -= 8; + } + x = _mm256_loadu_si256((__m256i *) (reading + 0)); + x2 = _mm256_loadu_si256((__m256i *) (reading + 16)); + x = _mm256_add_epi16(x, _mm256_set1_epi16(2295)); + x2 = _mm256_add_epi16(x2, _mm256_set1_epi16(2295)); + x &= _mm256_set1_epi16(16383); + x2 &= _mm256_set1_epi16(16383); + y = x & _mm256_set1_epi32(65535); + y2 = x2 & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x2 = _mm256_srli_epi32(x2, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(4591)); + x2 = _mm256_mullo_epi32(x2, _mm256_set1_epi32(4591)); + x = _mm256_add_epi32(y, x); + x2 = _mm256_add_epi32(y2, x2); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x2 = _mm256_shuffle_epi8(x2, _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + x2 = _mm256_permute4x64_epi64(x2, 0xd8); + _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31)); + _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20)); + reading += 32; + writing += 16; + out += 32; + } + R[380] = (uint16) (((R0[760] + 2295) & 16383)); + + reading = (uint16 *) R; + writing = R; + i = 24; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 4; + writing -= 2; + out -= 2; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(322)); + x = _mm256_add_epi32(y, x); + x = 
_mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + R[190] = R[380]; + + reading = (uint16 *) R; + writing = R; + i = 12; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(406)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + R[95] = R[190]; + + reading = (uint16 *) R; + writing = R; + i = 6; + while (i > 0) { + __m256i x, y; + --i; + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(644)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + + reading = (uint16 *) R; + writing = R; + i = 3; + while (i > 0) { + __m256i x, y; + --i; + if (!i) { + reading -= 2; + writing -= 1; + out -= 1; + } + x = _mm256_loadu_si256((__m256i *) reading); + y = x & _mm256_set1_epi32(65535); + x = _mm256_srli_epi32(x, 16); + x = _mm256_mullo_epi32(x, _mm256_set1_epi32(1621)); + x = _mm256_add_epi32(y, x); + x = _mm256_shuffle_epi8(x, _mm256_set_epi8( + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1, + 12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1 + )); + x = _mm256_permute4x64_epi64(x, 0xd8); + _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0)); + s0 = (uint32) _mm256_extract_epi32(x, 4); + *out++ = (unsigned char) s0; + s0 >>= 8; + 
*out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 = (uint32) _mm256_extract_epi32(x, 6); + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + s0 >>= 8; + *out++ = (unsigned char) s0; + reading += 16; + writing += 8; + } + r0 = R[46]; + r1 = R[47]; + r2 = r0 + r1 * (uint32)1621; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[23] = (uint16) r2; + + for (i = 0; i < 11; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10265; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + r0 = R[22]; + r1 = R[23]; + r2 = r0 + r1 * (uint32)10265; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[11] = (uint16) r2; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1608; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + r0 = R[10]; + r1 = R[11]; + r2 = r0 + r1 * (uint32)1608; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[5] = (uint16) r2; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10101; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)10101; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[2] = (uint16) r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)1557; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)9470; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + + r0 = R[0]; + *out++ = (unsigned char) r0; + r0 >>= 8; + *out++ = (unsigned char) r0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x4591.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x4591.h new file mode 100644 index 0000000000..52404cc2e0 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761x4591.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X4591_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761X4591_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591_STRBYTES 1158 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591(unsigned char *out, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.c new file mode 100644 index 0000000000..6622e31021 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.c @@ -0,0 +1,31 @@ +#include "crypto_encode_761xfreeze3.h" +#include +#define int16 int16_t + +#define p 761 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3(unsigned char *s, const void *v) { + const int16 *r = v; + + int i = p - 16; + for (;;) { + do { + __m256i x = _mm256_loadu_si256((__m256i *) r); + __m256i y = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923)); + x = _mm256_sub_epi16(x, y); + y = _mm256_add_epi16(y, y); + x = _mm256_sub_epi16(x, y); + __m128i x0 = _mm256_extractf128_si256(x, 0); + __m128i x1 = _mm256_extractf128_si256(x, 1); + _mm_storeu_si128((__m128i *) s, 
_mm_packs_epi16(x0, x1)); + i -= 16; + r += 16; + s += 16; + } while (i >= 0); + if (i <= -16) { + break; + } + r += i; + s += i; + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.h new file mode 100644 index 0000000000..cc89f9a5e7 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xfreeze3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761XFREEZE3_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761XFREEZE3_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3_STRBYTES 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3(unsigned char *s, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xint16.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xint16.c new file mode 100644 index 0000000000..db839f0867 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_761xint16.h" + + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u = *x++; + *s++ = (unsigned char) u; + *s++ = (unsigned char) (u >> 8); + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xint16.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xint16.h new file mode 100644 index 0000000000..6689ad6dac --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761XINT16_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_761XINT16_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16_STRBYTES 1522 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16_ITEMS 761 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16(unsigned char *s, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_int16.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_int16.c new file mode 100644 index 0000000000..53b9535d89 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_encode_int16.h" + +#define uint16 uint16_t + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16(unsigned char *s, const void *x) { + uint16 u = *(const uint16 *) x; + s[0] = (unsigned char) u; + s[1] = (unsigned char) (u >> 8); +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_int16.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_int16.h new file mode 100644 index 0000000000..e8f5a2194d --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_encode_int16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_INT16_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_ENCODE_INT16_H + +#include +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16_STRBYTES 2 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16_ITEMS 1 +#define PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_AVX2_crypto_encode_int16(unsigned char *s, const void *x); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_int32.c 
b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_int32.c new file mode 100644 index 0000000000..a3268aef0a --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_int32.c @@ -0,0 +1,1215 @@ +#include "crypto_sort_int32.h" +#include + +#define int32 int32_t + +typedef __m256i int32x8; +#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z)) +#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i)) +#define int32x8_min _mm256_min_epi32 +#define int32x8_max _mm256_max_epi32 + +#define int32x8_MINMAX(a,b) \ + do { \ + int32x8 c = int32x8_min((a),(b)); \ + (b) = int32x8_max((a),(b)); \ + (a) = c; \ + } while(0) + +static inline void int32_MINMAX(int32 *a, int32 *b) { + int32 ab = *b ^ *a; + int32 c = (int32)((int64_t) * b - (int64_t) * a); + c ^= ab & (c ^ *b); + c >>= 31; + c &= ab; + *a ^= c; + *b ^= c; +} + +static void minmax_vector(int32 *x, int32 *y, long long n) { + if (n < 8) { + while (n > 0) { + int32_MINMAX(x, y); + ++x; + ++y; + --n; + } + return; + } + if (n & 7) { + int32x8 x0 = int32x8_load(x + n - 8); + int32x8 y0 = int32x8_load(y + n - 8); + int32x8_MINMAX(x0, y0); + int32x8_store(x + n - 8, x0); + int32x8_store(y + n - 8, y0); + n &= ~7; + } + do { + int32x8 x0 = int32x8_load(x); + int32x8 y0 = int32x8_load(y); + int32x8_MINMAX(x0, y0); + int32x8_store(x, x0); + int32x8_store(y, y0); + x += 8; + y += 8; + n -= 8; + } while (n); +} + +/* stages 8,4,2,1 of size-16 bitonic merging */ +static void merge16_finish(int32 *x, int32x8 x0, int32x8 x1, int flagdown) { + int32x8 b0, b1, c0, c1, mask; + + int32x8_MINMAX(x0, x1); + + b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + + int32x8_MINMAX(b0, b1); + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0213B0213 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4657B4657 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* A0246B0246 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* A1357B1357 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A0123B0123 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A4567B4567 */ + + x0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01234567 */ + x1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A01234567 */ + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + } + + int32x8_store(&x[0], x0); + int32x8_store(&x[8], x1); +} + +/* stages 64,32 of bitonic merging; n is multiple of 128 */ +static void int32_twostages_32(int32 *x, long long n) { + long long i; + + while (n > 0) { + for (i = 0; i < 32; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + 32]); + int32x8 x2 = int32x8_load(&x[i + 64]); + int32x8 x3 = int32x8_load(&x[i + 96]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 32], x1); + int32x8_store(&x[i + 64], x2); + int32x8_store(&x[i + 96], x3); + } + x += 128; + n -= 128; + } +} + +/* stages 4q,2q,q of bitonic merging */ +static long long int32_threestages(int32 *x, long long n, long long q) { + long long k, i; + + for (k = 0; k + 8 * q <= n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = 
int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + + return k; +} + +/* n is a power of 2; n >= 8; if n == 8 then flagdown */ +// NOLINTNEXTLINE(google-readability-function-size) +static void int32_sort_2power(int32 *x, long long n, int flagdown) { + long long p, q, i, j, k; + int32x8 mask; + + if (n == 8) { + int32 x0 = x[0]; + int32 x1 = x[1]; + int32 x2 = x[2]; + int32 x3 = x[3]; + int32 x4 = x[4]; + int32 x5 = x[5]; + int32 x6 = x[6]; + int32 x7 = x[7]; + + /* odd-even sort instead of bitonic sort */ + + int32_MINMAX(&x1, &x0); + int32_MINMAX(&x3, &x2); + int32_MINMAX(&x2, &x0); + int32_MINMAX(&x3, &x1); + int32_MINMAX(&x2, &x1); + + int32_MINMAX(&x5, &x4); + int32_MINMAX(&x7, &x6); + int32_MINMAX(&x6, &x4); + int32_MINMAX(&x7, &x5); + int32_MINMAX(&x6, &x5); + + int32_MINMAX(&x4, &x0); + int32_MINMAX(&x6, &x2); + int32_MINMAX(&x4, &x2); + + int32_MINMAX(&x5, &x1); + int32_MINMAX(&x7, &x3); + int32_MINMAX(&x5, &x3); + + int32_MINMAX(&x2, &x1); + int32_MINMAX(&x4, &x3); + int32_MINMAX(&x6, &x5); + + x[0] = x0; + x[1] = x1; + x[2] = x2; + x[3] = x3; + x[4] = x4; + x[5] = x5; + x[6] = x6; + x[7] = x7; + return; + } + + if (n == 16) { + int32x8 x0, x1, b0, b1, c0, c1; + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + + mask = _mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1); + + x0 ^= mask; /* A01234567 */ + x1 ^= mask; /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + c0 ^= mask; + c1 ^= mask; + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + b0 ^= mask; + b1 ^= mask; + + c0 = _mm256_permute2x128_si256(b0, b1, 0x20); /* A01B01A23B23 */ + c1 = _mm256_permute2x128_si256(b0, b1, 0x31); /* A45B45A67B67 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_permute2x128_si256(c0, c1, 0x20); /* A01B01A45B45 */ + b1 = _mm256_permute2x128_si256(c0, c1, 0x31); /* A23B23A67B67 */ + + int32x8_MINMAX(b0, b1); + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + b0 = _mm256_unpacklo_epi32(x0, 
x1); /* AB0AB1AB4AB5 */ + b1 = _mm256_unpackhi_epi32(x0, x1); /* AB2AB3AB6AB7 */ + + c0 = _mm256_unpacklo_epi64(b0, b1); /* AB0AB2AB4AB6 */ + c1 = _mm256_unpackhi_epi64(b0, b1); /* AB1AB3AB5AB7 */ + + int32x8_MINMAX(c0, c1); + + b0 = _mm256_unpacklo_epi32(c0, c1); /* A01B01A45B45 */ + b1 = _mm256_unpackhi_epi32(c0, c1); /* A23B23A67B67 */ + + x0 = _mm256_unpacklo_epi64(b0, b1); /* A01234567 */ + x1 = _mm256_unpackhi_epi64(b0, b1); /* B01234567 */ + + mask = _mm256_set1_epi32(-1); + if (flagdown) { + x1 ^= mask; + } else { + x0 ^= mask; + } + + merge16_finish(x, x0, x1, flagdown); + return; + } + + if (n == 32) { + int32x8 x0, x1, x2, x3; + + int32_sort_2power(x, 16, 1); + int32_sort_2power(x + 16, 16, 0); + + x0 = int32x8_load(&x[0]); + x1 = int32x8_load(&x[8]); + x2 = int32x8_load(&x[16]); + x3 = int32x8_load(&x[24]); + + if (flagdown) { + mask = _mm256_set1_epi32(-1); + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + } + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + + merge16_finish(x, x0, x1, flagdown); + merge16_finish(x + 16, x2, x3, flagdown); + return; + } + + p = n >> 3; + for (i = 0; i < p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x2 = int32x8_load(&x[i + 2 * p]); + int32x8 x4 = int32x8_load(&x[i + 4 * p]); + int32x8 x6 = int32x8_load(&x[i + 6 * p]); + + /* odd-even stage instead of bitonic stage */ + + int32x8_MINMAX(x4, x0); + int32x8_MINMAX(x6, x2); + int32x8_MINMAX(x2, x0); + int32x8_MINMAX(x6, x4); + int32x8_MINMAX(x2, x4); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + 2 * p], x2); + int32x8_store(&x[i + 4 * p], x4); + int32x8_store(&x[i + 6 * p], x6); + + int32x8 x1 = int32x8_load(&x[i + p]); + int32x8 x3 = int32x8_load(&x[i + 3 * p]); + int32x8 x5 = int32x8_load(&x[i + 5 * p]); + int32x8 x7 = int32x8_load(&x[i + 7 * p]); + + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x5, x3); + + int32x8_store(&x[i + p], x1); + int32x8_store(&x[i + 3 * p], x3); + int32x8_store(&x[i + 5 * p], x5); + int32x8_store(&x[i + 7 * p], x7); + } + + if (n >= 128) { + int flip, flipflip; + + mask = _mm256_set1_epi32(-1); + + for (j = 0; j < n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 16]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 16], x1); + } + + p = 8; + for (;;) { /* for p in [8, 16, ..., n/16] */ + q = p >> 1; + while (q >= 128) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + if (q == 64) { + int32_twostages_32(x, n); + q = 16; + } + if (q == 32) { + q = 8; + for (k = 0; k < n; k += 8 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], 
x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q = 4; + } + if (q == 16) { + q = 8; + for (k = 0; k < n; k += 4 * q) { + for (i = k; i < k + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q = 4; + } + if (q == 8) { + for (k = 0; k < n; k += q + q) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + } + } + + q = n >> 3; + flip = 0; + if (p << 1 == q) { + flip = 1; + } + flipflip = 1 - flip; + for (j = 0; j < q; j += p + p) { + for (k = j; k < j + p + p; k += p) { + for (i = k; i < k + p; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + if (flip) { + x0 ^= mask; + x1 ^= mask; + x2 ^= mask; + x3 ^= mask; + x4 ^= mask; + x5 ^= mask; + x6 ^= mask; + x7 ^= mask; + } + + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + flip ^= 1; + } + flip ^= flipflip; + } + + if (p << 4 == n) { + break; + } + p <<= 1; + } + } + + for (p = 4; p >= 1; p >>= 1) { + int32 *z = x; + int32 *target = x + n; + if (p == 4) { + mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8_store(&z[0], x0); + int32x8_store(&z[8], x1); + z += 16; + } + } else if (p == 2) { + mask = _mm256_set_epi32(0, 0, -1, -1, -1, -1, 0, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8_MINMAX(b0, b1); + int32x8 c0 = _mm256_permute2x128_si256(b0, b1, 0x20); + int32x8 c1 = _mm256_permute2x128_si256(b0, b1, 0x31); + int32x8_store(&z[0], c0); + int32x8_store(&z[8], c1); + z += 16; + } + } else { /* p == 1 */ + mask = _mm256_set_epi32(0, -1, -1, 0, 0, -1, -1, 0); + while (z != target) { + int32x8 x0 = int32x8_load(&z[0]); + int32x8 x1 = int32x8_load(&z[8]); + x0 ^= mask; + x1 ^= mask; + int32x8 b0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* A0123B0123 */ + int32x8 b1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* A4567B4567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* A0145B0145 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* A2367B2367 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = 
_mm256_unpacklo_epi64(c0, c1); /* A0123B0123 */ + int32x8 d1 = _mm256_unpackhi_epi64(c0, c1); /* A4567B4567 */ + int32x8_MINMAX(d0, d1); + int32x8 e0 = _mm256_permute2x128_si256(d0, d1, 0x20); + int32x8 e1 = _mm256_permute2x128_si256(d0, d1, 0x31); + int32x8_store(&z[0], e0); + int32x8_store(&z[8], e1); + z += 16; + } + } + + q = n >> 4; + while (q >= 128 || q == 32) { + int32_threestages(x, n, q >> 2); + q >>= 3; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (k = j; k < j + q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += 2 * q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + + int32x8_MINMAX(x0, x1); + + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (k = 0; k < q; k += 8) { + int32x8 x0 = int32x8_load(&x[k]); + int32x8 x1 = int32x8_load(&x[k + q]); + int32x8 x2 = int32x8_load(&x[k + 2 * q]); + int32x8 x3 = int32x8_load(&x[k + 3 * q]); + int32x8 x4 = int32x8_load(&x[k + 4 * q]); + int32x8 x5 = int32x8_load(&x[k + 5 * q]); + int32x8 x6 = int32x8_load(&x[k + 6 * q]); + int32x8 x7 = int32x8_load(&x[k + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8_store(&x[k], x0); + int32x8_store(&x[k + q], x1); + int32x8_store(&x[k + 2 * q], x2); + int32x8_store(&x[k + 3 * q], x3); + int32x8_store(&x[k + 4 * q], x4); + int32x8_store(&x[k + 5 * q], x5); + int32x8_store(&x[k + 6 * q], x6); + int32x8_store(&x[k + 7 * q], x7); + } + } + + /* everything is still masked with _mm256_set_epi32(0,-1,0,-1,0,-1,0,-1); */ + mask = _mm256_set1_epi32(-1); + + for (i = 0; i < n; i += 64) { + int32x8 a0 = int32x8_load(&x[i]); + int32x8 a1 = int32x8_load(&x[i + 8]); + int32x8 a2 = int32x8_load(&x[i + 16]); + int32x8 a3 = int32x8_load(&x[i + 24]); + int32x8 a4 = int32x8_load(&x[i + 32]); + int32x8 a5 = int32x8_load(&x[i + 40]); + int32x8 a6 = int32x8_load(&x[i + 48]); + int32x8 a7 = int32x8_load(&x[i + 56]); + + int32x8 b0 = _mm256_unpacklo_epi32(a0, a1); /* AB0AB1AB4AB5 */ + int32x8 b1 = _mm256_unpackhi_epi32(a0, a1); /* AB2AB3AB6AB7 */ + int32x8 b2 = _mm256_unpacklo_epi32(a2, a3); /* CD0CD1CD4CD5 */ + int32x8 b3 = _mm256_unpackhi_epi32(a2, a3); /* CD2CD3CD6CD7 */ + int32x8 b4 = _mm256_unpacklo_epi32(a4, a5); /* EF0EF1EF4EF5 */ + int32x8 b5 = _mm256_unpackhi_epi32(a4, a5); /* EF2EF3EF6EF7 */ + int32x8 b6 = _mm256_unpacklo_epi32(a6, a7); /* GH0GH1GH4GH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(a6, a7); /* GH2GH3GH6GH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b2); /* ABCD0ABCD4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b3); /* ABCD2ABCD6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b2); /* ABCD1ABCD5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b3); /* ABCD3ABCD7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b4, b6); /* EFGH0EFGH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b5, b7); /* EFGH2EFGH6 */ + int32x8 c6 = 
_mm256_unpackhi_epi64(b4, b6); /* EFGH1EFGH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b5, b7); /* EFGH3EFGH7 */ + + if (flagdown) { + c2 ^= mask; + c3 ^= mask; + c6 ^= mask; + c7 ^= mask; + } else { + c0 ^= mask; + c1 ^= mask; + c4 ^= mask; + c5 ^= mask; + } + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* ABCDEFGH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c2, c6, 0x20); /* ABCDEFGH1 */ + int32x8 d2 = _mm256_permute2x128_si256(c1, c5, 0x20); /* ABCDEFGH2 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* ABCDEFGH5 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* ABCDEFGH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c2, c6, 0x31); /* ABCDEFGH3 */ + int32x8 d6 = _mm256_permute2x128_si256(c1, c5, 0x31); /* ABCDEFGH6 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* ABCDEFGH7 */ + + int32x8_MINMAX(d0, d1); + int32x8_MINMAX(d2, d3); + int32x8_MINMAX(d4, d5); + int32x8_MINMAX(d6, d7); + int32x8_MINMAX(d0, d2); + int32x8_MINMAX(d1, d3); + int32x8_MINMAX(d4, d6); + int32x8_MINMAX(d5, d7); + int32x8_MINMAX(d0, d4); + int32x8_MINMAX(d1, d5); + int32x8_MINMAX(d2, d6); + int32x8_MINMAX(d3, d7); + + int32x8 e0 = _mm256_unpacklo_epi32(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi32(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi32(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi32(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi32(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi32(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi32(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi32(d6, d7); + + int32x8 f0 = _mm256_unpacklo_epi64(e0, e2); + int32x8 f1 = _mm256_unpacklo_epi64(e1, e3); + int32x8 f2 = _mm256_unpackhi_epi64(e0, e2); + int32x8 f3 = _mm256_unpackhi_epi64(e1, e3); + int32x8 f4 = _mm256_unpacklo_epi64(e4, e6); + int32x8 f5 = _mm256_unpacklo_epi64(e5, e7); + int32x8 f6 = _mm256_unpackhi_epi64(e4, e6); + int32x8 f7 = _mm256_unpackhi_epi64(e5, e7); + + int32x8 g0 = _mm256_permute2x128_si256(f0, f4, 0x20); + int32x8 g1 = _mm256_permute2x128_si256(f2, f6, 0x20); + int32x8 g2 = _mm256_permute2x128_si256(f1, f5, 0x20); + int32x8 g3 = _mm256_permute2x128_si256(f3, f7, 0x20); + int32x8 g4 = _mm256_permute2x128_si256(f0, f4, 0x31); + int32x8 g5 = _mm256_permute2x128_si256(f2, f6, 0x31); + int32x8 g6 = _mm256_permute2x128_si256(f1, f5, 0x31); + int32x8 g7 = _mm256_permute2x128_si256(f3, f7, 0x31); + + int32x8_store(&x[i], g0); + int32x8_store(&x[i + 8], g1); + int32x8_store(&x[i + 16], g2); + int32x8_store(&x[i + 24], g3); + int32x8_store(&x[i + 32], g4); + int32x8_store(&x[i + 40], g5); + int32x8_store(&x[i + 48], g6); + int32x8_store(&x[i + 56], g7); + } + + q = n >> 4; + while (q >= 128 || q == 32) { + q >>= 2; + for (j = 0; j < n; j += 8 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + int32x8_store(&x[i + 4 * q], x4); + 
int32x8_store(&x[i + 5 * q], x5); + int32x8_store(&x[i + 6 * q], x6); + int32x8_store(&x[i + 7 * q], x7); + } + } + q >>= 1; + } + while (q >= 16) { + q >>= 1; + for (j = 0; j < n; j += 4 * q) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + } + q >>= 1; + } + if (q == 8) { + for (j = 0; j < n; j += q + q) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + q], x1); + } + } + + q = n >> 3; + for (i = 0; i < q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8 x4 = int32x8_load(&x[i + 4 * q]); + int32x8 x5 = int32x8_load(&x[i + 5 * q]); + int32x8 x6 = int32x8_load(&x[i + 6 * q]); + int32x8 x7 = int32x8_load(&x[i + 7 * q]); + + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + + int32x8 b0 = _mm256_unpacklo_epi32(x0, x4); /* AE0AE1AE4AE5 */ + int32x8 b1 = _mm256_unpackhi_epi32(x0, x4); /* AE2AE3AE6AE7 */ + int32x8 b2 = _mm256_unpacklo_epi32(x1, x5); /* BF0BF1BF4BF5 */ + int32x8 b3 = _mm256_unpackhi_epi32(x1, x5); /* BF2BF3BF6BF7 */ + int32x8 b4 = _mm256_unpacklo_epi32(x2, x6); /* CG0CG1CG4CG5 */ + int32x8 b5 = _mm256_unpackhi_epi32(x2, x6); /* CG2CG3CG6CG7 */ + int32x8 b6 = _mm256_unpacklo_epi32(x3, x7); /* DH0DH1DH4DH5 */ + int32x8 b7 = _mm256_unpackhi_epi32(x3, x7); /* DH2DH3DH6DH7 */ + + int32x8 c0 = _mm256_unpacklo_epi64(b0, b4); /* AECG0AECG4 */ + int32x8 c1 = _mm256_unpacklo_epi64(b1, b5); /* AECG2AECG6 */ + int32x8 c2 = _mm256_unpackhi_epi64(b0, b4); /* AECG1AECG5 */ + int32x8 c3 = _mm256_unpackhi_epi64(b1, b5); /* AECG3AECG7 */ + int32x8 c4 = _mm256_unpacklo_epi64(b2, b6); /* BFDH0BFDH4 */ + int32x8 c5 = _mm256_unpacklo_epi64(b3, b7); /* BFDH2BFDH6 */ + int32x8 c6 = _mm256_unpackhi_epi64(b2, b6); /* BFDH1BFDH5 */ + int32x8 c7 = _mm256_unpackhi_epi64(b3, b7); /* BFDH3BFDH7 */ + + int32x8 d0 = _mm256_permute2x128_si256(c0, c4, 0x20); /* AECGBFDH0 */ + int32x8 d1 = _mm256_permute2x128_si256(c1, c5, 0x20); /* AECGBFDH2 */ + int32x8 d2 = _mm256_permute2x128_si256(c2, c6, 0x20); /* AECGBFDH1 */ + int32x8 d3 = _mm256_permute2x128_si256(c3, c7, 0x20); /* AECGBFDH3 */ + int32x8 d4 = _mm256_permute2x128_si256(c0, c4, 0x31); /* AECGBFDH4 */ + int32x8 d5 = _mm256_permute2x128_si256(c1, c5, 0x31); /* AECGBFDH6 */ + int32x8 d6 = _mm256_permute2x128_si256(c2, c6, 0x31); /* AECGBFDH5 */ + int32x8 d7 = _mm256_permute2x128_si256(c3, c7, 0x31); /* AECGBFDH7 */ + + if (flagdown) { + d0 ^= mask; + d1 ^= mask; + d2 ^= mask; + d3 ^= mask; + d4 ^= mask; + d5 ^= mask; + d6 ^= mask; + d7 ^= mask; + } + + int32x8_store(&x[i], d0); + int32x8_store(&x[i + q], d4); + int32x8_store(&x[i + 2 * q], d1); + int32x8_store(&x[i + 3 * q], d5); + int32x8_store(&x[i + 4 * q], d2); + int32x8_store(&x[i + 5 * q], d6); + int32x8_store(&x[i + 6 * q], 
d3); + int32x8_store(&x[i + 7 * q], d7); + } +} + +static void int32_sort(int32 *x, long long n) { + long long q, i, j; + + if (n <= 8) { + if (n == 8) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + int32_MINMAX(&x[6], &x[7]); + } + if (n >= 7) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + int32_MINMAX(&x[5], &x[6]); + } + if (n >= 6) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + int32_MINMAX(&x[4], &x[5]); + } + if (n >= 5) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + int32_MINMAX(&x[3], &x[4]); + } + if (n >= 4) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + int32_MINMAX(&x[2], &x[3]); + } + if (n >= 3) { + int32_MINMAX(&x[0], &x[1]); + int32_MINMAX(&x[1], &x[2]); + } + if (n >= 2) { + int32_MINMAX(&x[0], &x[1]); + } + return; + } + + if (!(n & (n - 1))) { + int32_sort_2power(x, n, 0); + return; + } + + q = 8; + while (q < n - q) { + q += q; + } + /* n > q >= 8 */ + + if (q <= 128) { /* n <= 256 */ + int32x8 y[32]; + for (i = q >> 3; i < q >> 2; ++i) { + y[i] = _mm256_set1_epi32(0x7fffffff); + } + for (i = 0; i < n; ++i) { + ((int32 *) y)[i] = x[i]; + } + int32_sort_2power((int32 *) y, 2 * q, 0); + for (i = 0; i < n; ++i) { + x[i] = ((int32 *) y)[i]; + } + return; + } + + int32_sort_2power(x, q, 1); + int32_sort(x + q, n - q); + + while (q >= 64) { + q >>= 2; + j = int32_threestages(x, n, q); + minmax_vector(x + j, x + j + 4 * q, n - 4 * q - j); + if (j + 4 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8 x2 = int32x8_load(&x[i + 2 * q]); + int32x8 x3 = int32x8_load(&x[i + 3 * q]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + int32x8_store(&x[i + 2 * q], x2); + int32x8_store(&x[i + 3 * q], x3); + } + j += 4 * q; + } + minmax_vector(x + j, x + j + 2 * q, n - 2 * q - j); + if (j + 2 * q <= n) { + for (i = j; i < j + q; i += 8) { + int32x8 x0 = int32x8_load(&x[i]); + int32x8 x1 = int32x8_load(&x[i + q]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[i], x0); + int32x8_store(&x[i + q], x1); + } + j += 2 * q; + } + minmax_vector(x + j, x + j + q, n - q - j); + q >>= 1; + } + if (q == 32) { + j = 0; + for (; j + 64 <= n; j += 64) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8 x4 = int32x8_load(&x[j + 32]); + int32x8 x5 = int32x8_load(&x[j + 40]); + int32x8 x6 = int32x8_load(&x[j + 48]); + int32x8 x7 = int32x8_load(&x[j + 56]); + int32x8_MINMAX(x0, x4); + int32x8_MINMAX(x1, x5); + int32x8_MINMAX(x2, x6); + int32x8_MINMAX(x3, x7); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x4, x6); + int32x8_MINMAX(x5, x7); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8_MINMAX(x4, x5); + int32x8_MINMAX(x6, x7); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8 a4 = 
_mm256_permute2x128_si256(x4, x5, 0x20); + int32x8 a5 = _mm256_permute2x128_si256(x4, x5, 0x31); + int32x8 a6 = _mm256_permute2x128_si256(x6, x7, 0x20); + int32x8 a7 = _mm256_permute2x128_si256(x6, x7, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8_MINMAX(a4, a5); + int32x8_MINMAX(a6, a7); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 b4 = _mm256_permute2x128_si256(a4, a5, 0x20); + int32x8 b5 = _mm256_permute2x128_si256(a4, a5, 0x31); + int32x8 b6 = _mm256_permute2x128_si256(a6, a7, 0x20); + int32x8 b7 = _mm256_permute2x128_si256(a6, a7, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8 c4 = _mm256_unpacklo_epi64(b4, b5); + int32x8 c5 = _mm256_unpackhi_epi64(b4, b5); + int32x8 c6 = _mm256_unpacklo_epi64(b6, b7); + int32x8 c7 = _mm256_unpackhi_epi64(b6, b7); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8_MINMAX(c4, c5); + int32x8_MINMAX(c6, c7); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 d4 = _mm256_unpacklo_epi32(c4, c5); + int32x8 d5 = _mm256_unpackhi_epi32(c4, c5); + int32x8 d6 = _mm256_unpacklo_epi32(c6, c7); + int32x8 d7 = _mm256_unpackhi_epi32(c6, c7); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8 e4 = _mm256_unpacklo_epi64(d4, d5); + int32x8 e5 = _mm256_unpackhi_epi64(d4, d5); + int32x8 e6 = _mm256_unpacklo_epi64(d6, d7); + int32x8 e7 = _mm256_unpackhi_epi64(d6, d7); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8_MINMAX(e4, e5); + int32x8_MINMAX(e6, e7); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8 f4 = _mm256_unpacklo_epi32(e4, e5); + int32x8 f5 = _mm256_unpackhi_epi32(e4, e5); + int32x8 f6 = _mm256_unpacklo_epi32(e6, e7); + int32x8 f7 = _mm256_unpackhi_epi32(e6, e7); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + int32x8_store(&x[j + 32], f4); + int32x8_store(&x[j + 40], f5); + int32x8_store(&x[j + 48], f6); + int32x8_store(&x[j + 56], f7); + } + minmax_vector(x + j, x + j + 32, n - 32 - j); + goto continue16; + } + if (q == 16) { + j = 0; +continue16: + for (; j + 32 <= n; j += 32) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8 x2 = int32x8_load(&x[j + 16]); + int32x8 x3 = int32x8_load(&x[j + 24]); + int32x8_MINMAX(x0, x2); + int32x8_MINMAX(x1, x3); + int32x8_MINMAX(x0, x1); + int32x8_MINMAX(x2, x3); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); + int32x8 a2 = _mm256_permute2x128_si256(x2, x3, 0x20); + int32x8 a3 = _mm256_permute2x128_si256(x2, x3, 0x31); + int32x8_MINMAX(a0, a1); + int32x8_MINMAX(a2, a3); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); + int32x8 b2 = _mm256_permute2x128_si256(a2, a3, 0x20); + 
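+ /* b0..b3 regroup the 128-bit halves; the unpack/MINMAX stages below continue this merge down to adjacent elements within each 8-lane block */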
int32x8 b3 = _mm256_permute2x128_si256(a2, a3, 0x31); + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); + int32x8 c2 = _mm256_unpacklo_epi64(b2, b3); + int32x8 c3 = _mm256_unpackhi_epi64(b2, b3); + int32x8_MINMAX(c0, c1); + int32x8_MINMAX(c2, c3); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); + int32x8 d2 = _mm256_unpacklo_epi32(c2, c3); + int32x8 d3 = _mm256_unpackhi_epi32(c2, c3); + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); + int32x8 e2 = _mm256_unpacklo_epi64(d2, d3); + int32x8 e3 = _mm256_unpackhi_epi64(d2, d3); + int32x8_MINMAX(e0, e1); + int32x8_MINMAX(e2, e3); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); + int32x8 f2 = _mm256_unpacklo_epi32(e2, e3); + int32x8 f3 = _mm256_unpackhi_epi32(e2, e3); + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + int32x8_store(&x[j + 16], f2); + int32x8_store(&x[j + 24], f3); + } + minmax_vector(x + j, x + j + 16, n - 16 - j); + goto continue8; + } + /* q == 8 */ + j = 0; +continue8: + for (; j + 16 <= n; j += 16) { + int32x8 x0 = int32x8_load(&x[j]); + int32x8 x1 = int32x8_load(&x[j + 8]); + int32x8_MINMAX(x0, x1); + int32x8_store(&x[j], x0); + int32x8_store(&x[j + 8], x1); + int32x8 a0 = _mm256_permute2x128_si256(x0, x1, 0x20); /* x0123y0123 */ + int32x8 a1 = _mm256_permute2x128_si256(x0, x1, 0x31); /* x4567y4567 */ + int32x8_MINMAX(a0, a1); + int32x8 b0 = _mm256_permute2x128_si256(a0, a1, 0x20); /* x01234567 */ + int32x8 b1 = _mm256_permute2x128_si256(a0, a1, 0x31); /* y01234567 */ + int32x8 c0 = _mm256_unpacklo_epi64(b0, b1); /* x01y01x45y45 */ + int32x8 c1 = _mm256_unpackhi_epi64(b0, b1); /* x23y23x67y67 */ + int32x8_MINMAX(c0, c1); + int32x8 d0 = _mm256_unpacklo_epi32(c0, c1); /* x02x13x46x57 */ + int32x8 d1 = _mm256_unpackhi_epi32(c0, c1); /* y02y13y46y57 */ + int32x8 e0 = _mm256_unpacklo_epi64(d0, d1); /* x02y02x46y46 */ + int32x8 e1 = _mm256_unpackhi_epi64(d0, d1); /* x13y13x57y57 */ + int32x8_MINMAX(e0, e1); + int32x8 f0 = _mm256_unpacklo_epi32(e0, e1); /* x01234567 */ + int32x8 f1 = _mm256_unpackhi_epi32(e0, e1); /* y01234567 */ + int32x8_store(&x[j], f0); + int32x8_store(&x[j + 8], f1); + } + minmax_vector(x + j, x + j + 8, n - 8 - j); + if (j + 8 <= n) { + int32_MINMAX(&x[j], &x[j + 4]); + int32_MINMAX(&x[j + 1], &x[j + 5]); + int32_MINMAX(&x[j + 2], &x[j + 6]); + int32_MINMAX(&x[j + 3], &x[j + 7]); + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + int32_MINMAX(&x[j + 4], &x[j + 6]); + int32_MINMAX(&x[j + 5], &x[j + 7]); + int32_MINMAX(&x[j + 4], &x[j + 5]); + int32_MINMAX(&x[j + 6], &x[j + 7]); + j += 8; + } + minmax_vector(x + j, x + j + 4, n - 4 - j); + if (j + 4 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + int32_MINMAX(&x[j + 1], &x[j + 3]); + int32_MINMAX(&x[j], &x[j + 1]); + int32_MINMAX(&x[j + 2], &x[j + 3]); + j += 4; + } + if (j + 3 <= n) { + int32_MINMAX(&x[j], &x[j + 2]); + } + if (j + 2 <= n) { + int32_MINMAX(&x[j], &x[j + 1]); + } +} + +void PQCLEAN_SNTRUP761_AVX2_crypto_sort_int32(void *array, long long n) { + int32_sort(array, n); +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_int32.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_int32.h new file mode 100644 index 0000000000..2f5329ddff --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_int32.h @@ -0,0 +1,8 @@ +#ifndef 
PQCLEAN_SNTRUP761_AVX2_CRYPTO_SORT_INT32_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_SORT_INT32_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_sort_int32_BYTES 4 + +void PQCLEAN_SNTRUP761_AVX2_crypto_sort_int32(void *array, long long n); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_uint32.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_uint32.c new file mode 100644 index 0000000000..e9ee179c04 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_uint32.c @@ -0,0 +1,18 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" + + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_SNTRUP761_AVX2_crypto_sort_uint32(void *array, long long n) { + uint32_t *x = array; + long long j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_SNTRUP761_AVX2_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_uint32.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_uint32.h new file mode 100644 index 0000000000..7d22be8432 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_sort_uint32.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_SORT_UINT32_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_SORT_UINT32_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_sort_uint32_BYTES 4 + +void PQCLEAN_SNTRUP761_AVX2_crypto_sort_uint32(void *array, long long n); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_verify_1039.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_verify_1039.c new file mode 100644 index 0000000000..e2bc9f1cbf --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_verify_1039.c @@ -0,0 +1,36 @@ +#include "crypto_verify_1039.h" +#include <immintrin.h> + +int PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039(const unsigned char *x, const unsigned char *y) { + __m256i diff = _mm256_set1_epi8(0); + unsigned int differentbits = 0; + int i = PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039_BYTES; + + i -= 32; + for (;;) { + do { + __m256i x0 = _mm256_loadu_si256((__m256i *) x); + __m256i y0 = _mm256_loadu_si256((__m256i *) y); + diff |= x0 ^ y0; + i -= 32; + x += 32; + y += 32; + } while (i >= 0); + if (i <= -32) { + break; + } + x += i; + y += i; + } + + diff |= _mm256_srli_epi16(diff, 8); + diff |= _mm256_srli_epi32(diff, 16); + diff |= _mm256_srli_epi64(diff, 32); + + differentbits = (unsigned int) _mm256_extract_epi8(diff, 0); + differentbits |= (unsigned int) _mm256_extract_epi8(diff, 8); + differentbits |= (unsigned int) _mm256_extract_epi8(diff, 16); + differentbits |= (unsigned int) _mm256_extract_epi8(diff, 24); + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_verify_1039.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_verify_1039.h new file mode 100644 index 0000000000..b0d5ffd4f3 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/crypto_verify_1039.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP761_AVX2_CRYPTO_VERIFY_1039_H +#define PQCLEAN_SNTRUP761_AVX2_CRYPTO_VERIFY_1039_H + +#include <stdint.h> +#define PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039_BYTES 1039 + +int PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039(const unsigned char *x, const unsigned char *y); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/kem.c b/src/kem/ntruprime/pqclean_sntrup761_avx2/kem.c new file mode 100644 index 0000000000..d0d3ffae97 ---

/dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/kem.c @@ -0,0 +1,251 @@ +#include "api.h" +#include "crypto_declassify.h" +#include "crypto_sort_uint32.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* ----- small polynomials */ + +/* R3_fromR(R_fromRq(r)) */ +static void R3_fromRq(small *out, const Fq *r) { + crypto_encode_pxfreeze3((unsigned char *) out, (unsigned char *) r); +} + +/* h = f*g in the ring R3 */ +static void R3_mult(small *h, const small *f, const small *g) { + crypto_core_mult3((unsigned char *) h, (const unsigned char *) f, (const unsigned char *) g); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* h = 3f in Rq */ +static void Rq_mult3(Fq *h, const Fq *f) { + crypto_encode_pxint16((unsigned char *) h, f); + crypto_core_scale3((unsigned char *) h, (const unsigned char *) h); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* out = 1/(3*in) in Rq */ +/* caller must have 2p+1 bytes free in out, not just 2p */ +static void Rq_recip3(Fq *out, const small *in) { + crypto_core_inv((unsigned char *) out, (const unsigned char *) in); + /* could check byte 2*p for failure; but, in context, inv always works */ + crypto_decode_pxint16(out, (unsigned char *) out); +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, (size_t) inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[ppadsort]; + int i; + + randombytes((unsigned char *) L, 4 * p); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < w; ++i) { + L[i] = L[i] & (uint32) - 2; + } + for (i = w; i < p; ++i) { + L[i] = (L[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_SNTRUP761_AVX2_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (small) ((L[i] & 3) - 1); + } +} + +static void Small_random(small *out) { + uint32 L[p]; + int i; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + out[i] = (small) ((((L[i] & 0x3fffffff) * 3) >> 30) - 1); + } +} + +/* ----- Streamlined NTRU Prime */ + +typedef small Inputs[p]; /* passed by reference */ +#define Ciphertexts_bytes Rounded_bytes +#define SecretKeys_bytes (2*Small_bytes) +#define PublicKeys_bytes Rq_bytes +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +/* also set r_enc[0]=3 */ +/* also set x[0]=2, and x[1:1+Hash_bytes] = Hash3(r_enc) */ +/* also overwrite x[1+Hash_bytes:1+2*Hash_bytes] */ +static void Hide(unsigned char *x, unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + Fq h[p]; + int i; + + Small_encode(r_enc + 1, r); + Rq_decode(h, pk); + 
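+ /* h is the public key as an element of Rq; the next two calls compute c = Round(h*r) */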
Rq_mult_small(h, r); + Round_and_encode(c, h); + r_enc[0] = 3; + Hash(x + 1, r_enc, 1 + Small_bytes); + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Hash_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c + Ciphertexts_bytes, x, 1 + Hash_bytes * 2); +} + + +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { + small g[p]; + for (;;) { + Small_random(g); + { + small v[p + 1]; + small vp; + crypto_core_inv3((unsigned char *) v, (const unsigned char *) g); + vp = v[p]; + crypto_declassify(&vp, sizeof vp); + if (vp == 0) { + Small_encode(sk + Small_bytes, v); + break; + } + } + } + { + small f[p]; + Short_random(f); + Small_encode(sk, f); + { + Fq h[p + 1]; + Rq_recip3(h, f); /* always works */ + Rq_mult_small(h, g); + Rq_encode(pk, h); + } + } + { + int i; + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Small_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Small_bytes); + } + return 0; +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { + unsigned char cache[Hash_bytes]; + int i; + { + unsigned char y[1 + PublicKeys_bytes]; /* XXX: can eliminate with incremental hashing */ + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + { + Inputs r; + Short_random(r); + { + unsigned char r_enc[Small_bytes + 1]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(x, c, r_enc, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + } + return 0; +} + +int PQCLEAN_SNTRUP761_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Small_bytes; + int mask, i; + Inputs r; + { + Fq d[p]; + Rounded_decode(d, c); + { + small f[p]; + Small_decode(f, sk); + Rq_mult_small(d, f); + Rq_mult3(d, d); + } + { + small e[p]; + small v[p]; + R3_fromRq(e, d); + Small_decode(v, sk + Small_bytes); + R3_mult(r, e, v); + } + crypto_core_wforce((unsigned char *) r, (unsigned char *) r); + } + { + unsigned char r_enc[1 + Small_bytes]; + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + /* XXX: can use incremental hashing to reduce x size */ + + Hide(x, cnew, r_enc, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Small_bytes; ++i) { + r_enc[i + 1] ^= (unsigned char) (mask & (r_enc[i + 1] ^ rho[i])); + } + Hash(x + 1, r_enc, 1 + Small_bytes); /* XXX: can instead do cmov on cached hash of rho */ + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = (unsigned char) (1 + mask); + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_avx2/params.h b/src/kem/ntruprime/pqclean_sntrup761_avx2/params.h new file mode 100644 index 0000000000..0224bbef70 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_avx2/params.h @@ -0,0 +1,76 @@ +#ifndef params_H +#define params_H +#include "crypto_core_inv3sntrup761.h" +#include "crypto_core_invsntrup761.h" +#include "crypto_core_mult3sntrup761.h" +#include 
"crypto_core_multsntrup761.h" +#include "crypto_core_scale3sntrup761.h" +#include "crypto_core_weightsntrup761.h" +#include "crypto_core_wforcesntrup761.h" +#include "crypto_decode_761x1531.h" +#include "crypto_decode_761x3.h" +#include "crypto_decode_761x4591.h" +#include "crypto_decode_761xint16.h" +#include "crypto_decode_761xint32.h" +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" +#include "crypto_encode_761x3.h" +#include "crypto_encode_761x4591.h" +#include "crypto_encode_761xfreeze3.h" +#include "crypto_encode_761xint16.h" +#include "crypto_encode_int16.h" +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include "crypto_verify_1039.h" + + +#define qinv 15631 /* reciprocal of q mod 2^16 */ +#define q31 467759 /* floor(2^31/q) */ +#define q27 29235 /* closest integer to 2^27/q */ +#define q18 57 /* closest integer to 2^18/q */ +#define q14 4 /* closest integer to 2^14/q */ +#define ppad 769 +#define endingmask _mm256_set_epi8(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0) +#define crypto_core_weight PQCLEAN_SNTRUP761_AVX2_crypto_core_weightsntrup761 +#define p 761 +#define q 4591 +#define w 286 + +#define ppadsort 761 + +#define crypto_verify_clen PQCLEAN_SNTRUP761_AVX2_crypto_verify_1039 + +#define Rq_bytes PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591_STRBYTES +#define Rq_encode PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x4591 +#define Rq_decode PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x4591 + +#define Rounded_bytes PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531_STRBYTES +#define Rounded_decode PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x1531 + +#define Round_and_encode PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x1531round + +#define Small_bytes PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3_STRBYTES +#define Small_encode PQCLEAN_SNTRUP761_AVX2_crypto_encode_761x3 +#define Small_decode PQCLEAN_SNTRUP761_AVX2_crypto_decode_761x3 + +#define crypto_encode_pxfreeze3 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xfreeze3 + +#define crypto_decode_pxint32 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint32 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_decode_761xint16 + +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_AVX2_crypto_encode_761xint16 + +#define crypto_core_wforce PQCLEAN_SNTRUP761_AVX2_crypto_core_wforcesntrup761 + +#define crypto_core_scale3 PQCLEAN_SNTRUP761_AVX2_crypto_core_scale3sntrup761 + +#define crypto_core_inv PQCLEAN_SNTRUP761_AVX2_crypto_core_invsntrup761 + +#define crypto_core_inv3 PQCLEAN_SNTRUP761_AVX2_crypto_core_inv3sntrup761 + +#define crypto_core_mult3 PQCLEAN_SNTRUP761_AVX2_crypto_core_mult3sntrup761 + +#define crypto_core_mult PQCLEAN_SNTRUP761_AVX2_crypto_core_multsntrup761 + +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/LICENSE b/src/kem/ntruprime/pqclean_sntrup761_clean/LICENSE new file mode 100644 index 0000000000..d5d21fff6d --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/api.h b/src/kem/ntruprime/pqclean_sntrup761_clean/api.h new file mode 100644 index 0000000000..e6ec52466f --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/api.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_API_H +#define PQCLEAN_SNTRUP761_CLEAN_API_H + +#include + + +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ALGNAME "sntrup761" + +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SECRETKEYBYTES 1763 +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_PUBLICKEYBYTES 1158 +#define 
PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CIPHERTEXTBYTES 1039 +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_BYTES 32 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk); +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_inv3sntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_inv3sntrup761.c new file mode 100644 index 0000000000..87f3750a4c --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_inv3sntrup761.c @@ -0,0 +1,110 @@ +#include "crypto_core_inv3sntrup761.h" +#include "params.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = (uint16) x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = ~v + 1; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -(int) v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = (uint16) x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* works for -16384 <= x < 16384 */ +static small F3_freeze(int16 x) { + return (small) (x - 3 * ((10923 * x + 16384) >> 15)); +} + +/* byte p of output is 0 if recip succeeded; else -1 */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + small *in = (void *) inbytes; + small f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int i, loop, delta; + int sign, swap, t; + + for (i = 0; i < p + 1; ++i) { + v[i] = 0; + } + for (i = 0; i < p + 1; ++i) { + r[i] = 0; + } + r[0] = 1; + for (i = 0; i < p; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) { + small i1 = in[i] & 1; + g[p - 1 - i] = (small) (i1 - (in[i] & (i1 << 1))); + } + g[p] = 0; + + delta = 1; + + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) { + v[i] = v[i - 1]; + } + v[0] = 0; + + sign = -g[0] * f[0]; + swap = int16_negative_mask((int16) - delta) & int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= (small) t; + g[i] ^= (small) t; + t = swap & (v[i] ^ r[i]); + v[i] ^= (small) t; + r[i] ^= (small) t; + } + + for (i = 0; i < p + 1; ++i) { + g[i] = F3_freeze((int16) (g[i] + sign * f[i])); + } + for (i = 0; i < p + 1; ++i) { + r[i] = F3_freeze((int16) (r[i] + sign * v[i])); + } + + for (i = 0; i < p; ++i) { + g[i] = g[i + 1]; + } + g[p] = (int16) 0; + } + + sign = (int) f[0]; + for (i = 0; i < p; ++i) { + out[i] = (small) (sign * v[p - 1 - i]); + } + + out[p] = (small) int16_nonzero_mask((int16) delta); + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_inv3sntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_inv3sntrup761.h new file mode 100644 index 0000000000..583e49c0e0 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_inv3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_INV3SNTRUP761_H +#define 
PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_INV3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761_OUTPUTBYTES 762 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_invsntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_invsntrup761.c new file mode 100644 index 0000000000..f9c8a09572 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_invsntrup761.c @@ -0,0 +1,132 @@ +#include "crypto_core_invsntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + + +/* ----- masks */ + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = (uint16) x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = ~v + 1; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -(int) v; /* 0, else -1 */ +} + +/* return -1 if x<0; otherwise return 0 */ +static int int16_negative_mask(int16 x) { + uint16 u = (uint16) x; + u >>= 15; + return -(int) u; + /* alternative with gcc -fwrapv: */ + /* x>>15 compiles to CPU's arithmetic right shift */ +} + +/* ----- arithmetic mod q */ + +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +static Fq Fq_bigfreeze(int32 x) { + x -= q * ((q14 * x) >> 14); + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + x -= q * ((q27 * x + 67108864) >> 27); + return (Fq) x; +} + +static Fq Fq_recip(Fq a1) { + int i = 1; + Fq ai = a1; + + while (i < q - 2) { + ai = Fq_bigfreeze(a1 * (int32)ai); + i += 1; + } + return ai; +} + +/* ----- polynomials mod q */ + +/* out = 1/(3*in) in Rq */ +/* outbytes[2*p] is 0 if recip succeeded; else -1 */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *in = (void *) inbytes; + Fq out[p], f[p + 1], g[p + 1], v[p + 1], r[p + 1]; + int i, loop, delta; + int swap, t; + int32 f0, g0; + Fq scale; + + for (i = 0; i < p + 1; ++i) { + v[i] = 0; + } + for (i = 0; i < p + 1; ++i) { + r[i] = 0; + } + r[0] = Fq_recip(3); + for (i = 0; i < p; ++i) { + f[i] = 0; + } + f[0] = 1; + f[p - 1] = f[p] = -1; + for (i = 0; i < p; ++i) { + g[p - 1 - i] = (Fq) in[i]; + } + g[p] = 0; + + delta = 1; + + for (loop = 0; loop < 2 * p - 1; ++loop) { + for (i = p; i > 0; --i) { + v[i] = v[i - 1]; + } + v[0] = 0; + + swap = int16_negative_mask((int16) - delta) & int16_nonzero_mask(g[0]); + delta ^= swap & (delta ^ -delta); + delta += 1; + + for (i = 0; i < p + 1; ++i) { + t = swap & (f[i] ^ g[i]); + f[i] ^= (Fq) t; + g[i] ^= (Fq) t; + t = swap & (v[i] ^ r[i]); + v[i] ^= (Fq) t; + r[i] ^= (Fq) t; + } + + f0 = f[0]; + g0 = g[0]; + for (i = 0; i < p + 1; ++i) { + g[i] = Fq_bigfreeze(f0 * g[i] - g0 * f[i]); + } + for (i = 0; i < p + 1; ++i) { + r[i] = Fq_bigfreeze(f0 * r[i] - g0 * v[i]); + } + + for (i = 0; i < p; ++i) { + g[i] = g[i + 1]; + } + g[p] = 0; + } + + scale = Fq_recip(f[0]); + for (i = 0; i < p; ++i) { + out[i] = Fq_bigfreeze(scale * (int32)v[p - 1 - i]); + } + + crypto_encode_pxint16(outbytes, out); + outbytes[2 * p] = (unsigned char) 
int16_nonzero_mask((int16) delta); + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_invsntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_invsntrup761.h new file mode 100644 index 0000000000..c9e09ef402 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_invsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_INVSNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_INVSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761_OUTPUTBYTES 1523 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_mult3sntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_mult3sntrup761.c new file mode 100644 index 0000000000..b25892b41e --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_mult3sntrup761.c @@ -0,0 +1,57 @@ +#include "crypto_core_mult3sntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +typedef int8 small; + +/* works for -16384 <= x < 16384 */ +static small F3_freeze(int16 x) { + return (small) (x - 3 * ((10923 * x + 16384) >> 15)); +} + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + small *h = (void *) outbytes; + small f[p]; + small g[p]; + small fg[p + p - 1]; + int16 result; + int i, j; + + for (i = 0; i < p; ++i) { + small fi = (small) inbytes[i]; + small fi0 = fi & 1; + f[i] = (small) (fi0 - (fi & (fi0 << 1))); + } + for (i = 0; i < p; ++i) { + small gi = (small) kbytes[i]; + small gi0 = gi & 1; + g[i] = (small) (gi0 - (gi & (gi0 << 1))); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += (small) (f[j] * g[i - j]); + } + fg[i] = F3_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += (small) (f[j] * g[i - j]); + } + fg[i] = F3_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = F3_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = F3_freeze(fg[i - p + 1] + fg[i]); + } + + for (i = 0; i < p; ++i) { + h[i] = fg[i]; + } + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_mult3sntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_mult3sntrup761.h new file mode 100644 index 0000000000..d06fb73cd3 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_mult3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_MULT3SNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_MULT3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761_OUTPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761_KEYBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_multsntrup761.c 
b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_multsntrup761.c new file mode 100644 index 0000000000..54d3503a00 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_multsntrup761.c @@ -0,0 +1,60 @@ +#include "crypto_core_multsntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +typedef int8 small; + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* works for -7000000 < x < 7000000 if q in 4591, 4621, 5167, 6343, 7177, 7879 */ +static Fq Fq_freeze(int32 x) { + x -= q * ((q18 * x) >> 18); + x -= q * ((q27 * x + 67108864) >> 27); + return (Fq) x; +} + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes) { + Fq f[p]; + small g[p]; + Fq fg[p + p - 1]; + int32 result; + int i, j; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + f[i] = Fq_freeze(f[i]); + } + + for (i = 0; i < p; ++i) { + small gi = (small) kbytes[i]; + small gi0 = gi & 1; + g[i] = (small) (gi0 - (gi & (gi0 << 1))); + } + + for (i = 0; i < p; ++i) { + result = 0; + for (j = 0; j <= i; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + for (i = p; i < p + p - 1; ++i) { + result = 0; + for (j = i - p + 1; j < p; ++j) { + result += f[j] * (int32)g[i - j]; + } + fg[i] = Fq_freeze(result); + } + + for (i = p + p - 2; i >= p; --i) { + fg[i - p] = Fq_freeze(fg[i - p] + fg[i]); + fg[i - p + 1] = Fq_freeze(fg[i - p + 1] + fg[i]); + } + + crypto_encode_pxint16(outbytes, fg); + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_multsntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_multsntrup761.h new file mode 100644 index 0000000000..dc81268f95 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_multsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_MULTSNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_MULTSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761_INPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761_KEYBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761(unsigned char *outbytes, const unsigned char *inbytes, const unsigned char *kbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_scale3sntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_scale3sntrup761.c new file mode 100644 index 0000000000..f4da4e6fd7 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_scale3sntrup761.c @@ -0,0 +1,32 @@ +#include "crypto_core_scale3sntrup761.h" +#include "crypto_decode_761xint16.h" +#include "crypto_encode_761xint16.h" + + +#define p 761 +#define q 4591 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16 +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16 + +typedef int16_t Fq; + +/* out = 3*in in Rq */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + Fq f[p]; + int i; + + crypto_decode_pxint16(f, inbytes); + for (i = 0; i < p; ++i) { + Fq x = f[i]; + x *= 3; /* (-3q+3)/2 ... (3q-3)/2 */ + x -= (q + 1) / 2; /* -2q+1 ... q-2 */ + x += q & (x >> 15); /* -q+1 ... q-1 */ + x += q & (x >> 15); /* 0 ... 
q-1 */ + x -= (q - 1) / 2; /* -(q-1)/2 ... (q-1)/2 */ + f[i] = x; + } + crypto_encode_pxint16(outbytes, f); + + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_scale3sntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_scale3sntrup761.h new file mode 100644 index 0000000000..712cdb5bf0 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_scale3sntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_SCALE3SNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_SCALE3SNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761_OUTPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761_INPUTBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_weightsntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_weightsntrup761.c new file mode 100644 index 0000000000..3809abf054 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_weightsntrup761.c @@ -0,0 +1,21 @@ +#include "crypto_core_weightsntrup761.h" +#include "crypto_encode_int16.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t + + +/* out = little-endian weight of bottom bits of in */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + int8 *in = (void *) inbytes; + int16 weight = 0; + int i; + + for (i = 0; i < p; ++i) { + weight += in[i] & 1; + } + PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16(outbytes, &weight); + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_weightsntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_weightsntrup761.h new file mode 100644 index 0000000000..a1cef62fa4 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_weightsntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_WEIGHTSNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_WEIGHTSNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761_OUTPUTBYTES 2 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_weightsntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_wforcesntrup761.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_wforcesntrup761.c new file mode 100644 index 0000000000..01dc9ddd62 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_wforcesntrup761.c @@ -0,0 +1,48 @@ +#include "crypto_core_wforcesntrup761.h" +#include "params.h" + + +#define int8 int8_t +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +typedef int8 small; + + +/* return -1 if x!=0; else return 0 */ +static int int16_nonzero_mask(int16 x) { + uint16 u = (uint16) x; /* 0, else 1...65535 */ + uint32 v = u; /* 0, else 1...65535 */ + v = ~v + 1; /* 0, else 2^32-65535...2^32-1 */ + v >>= 31; /* 0, else 1 */ + return -(int) v; /* 0, else -1 */ +} + +/* 0 if Weightw_is(r), else -1 */ +static 
int Weightw_mask(const small *r) { + int weight = 0; + int i; + + for (i = 0; i < p; ++i) { + weight += r[i] & 1; + } + return int16_nonzero_mask((int16) (weight - w)); +} + +/* out = in if bottom bits of in have weight w */ +/* otherwise out = (1,1,...,1,0,0,...,0) */ +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761(unsigned char *outbytes, const unsigned char *inbytes) { + small *out = (void *) outbytes; + const small *in = (const void *) inbytes; + int i, mask; + + mask = Weightw_mask(in); /* 0 if weight w, else -1 */ + for (i = 0; i < w; ++i) { + out[i] = (small) (((in[i] ^ 1) & ~mask) ^ 1); + } + for (i = w; i < p; ++i) { + out[i] = (small) (in[i] & ~mask); + } + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_wforcesntrup761.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_wforcesntrup761.h new file mode 100644 index 0000000000..42e8423191 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_core_wforcesntrup761.h @@ -0,0 +1,11 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_WFORCESNTRUP761_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_CORE_WFORCESNTRUP761_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761_OUTPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761_INPUTBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761_KEYBYTES 0 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761_CONSTBYTES 0 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761(unsigned char *outbytes, const unsigned char *inbytes); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_declassify.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_declassify.h new file mode 100644 index 0000000000..a67a915c51 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_declassify.h @@ -0,0 +1,8 @@ +#ifndef crypto_declassify_h +#define crypto_declassify_h + + + +#define crypto_declassify(BUF,SIZE) + +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x1531.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x1531.c new file mode 100644 index 0000000000..bc72c01f9c --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x1531.c @@ -0,0 +1,211 @@ +#include "crypto_decode_761x1531.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. 
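+The divmod below therefore avoids the division instruction: it multiplies by a precomputed reciprocal v = floor(2^31/m), refines the quotient in two steps, and finishes with a constant-time conditional correction.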
+*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (uint32) ((x * (uint64)v) >> 31); + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (uint32) ((x * (uint64)v) >> 31); + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = (~(x >> 31) + 1); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = (uint16) x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 3475); /* needed only for invalid inputs */ + R10[0] = (uint16) r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 593); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 1500); /* needed only for invalid inputs */ + R9[1] = (uint16) r1; + + R8[2] = R9[1]; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6232); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 6232); /* needed only for invalid inputs */ + R8[1] = (uint16) r1; + + r2 = R8[2]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1263); + R7[4] = r0; + r1 = uint32_mod_uint14(r1, 304); /* needed only for invalid inputs */ + R7[5] = (uint16) r1; + for (i = 1; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1263); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1263); /* needed only for invalid inputs */ + R7[2 * i + 1] = (uint16) r1; + } + + r2 = R7[5]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9097); + R6[10] = r0; + r1 = uint32_mod_uint14(r1, 2188); /* needed only for invalid inputs */ + R6[11] = (uint16) r1; + for (i = 4; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9097); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 9097); /* needed only for invalid inputs */ + R6[2 * i + 1] = (uint16) r1; + } + + r2 = R6[11]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1526); + R5[22] = r0; + r1 = uint32_mod_uint14(r1, 367); /* needed only for invalid inputs */ + R5[23] = (uint16) r1; + for (i = 10; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1526); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1526); /* needed only for invalid inputs */ + R5[2 * i + 1] = (uint16) r1; + } + + r2 = R5[23]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, 
r2, 625); + R4[46] = r0; + r1 = uint32_mod_uint14(r1, 150); /* needed only for invalid inputs */ + R4[47] = (uint16) r1; + for (i = 22; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 625); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 625); /* needed only for invalid inputs */ + R4[2 * i + 1] = (uint16) r1; + } + + r2 = R4[47]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6400); + R3[94] = r0; + r1 = uint32_mod_uint14(r1, 1531); /* needed only for invalid inputs */ + R3[95] = (uint16) r1; + for (i = 46; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 6400); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 6400); /* needed only for invalid inputs */ + R3[2 * i + 1] = (uint16) r1; + } + + R2[190] = R3[95]; + for (i = 94; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1280); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1280); /* needed only for invalid inputs */ + R2[2 * i + 1] = (uint16) r1; + } + + R1[380] = R2[190]; + for (i = 189; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9157); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 9157); /* needed only for invalid inputs */ + R1[2 * i + 1] = (uint16) r1; + } + + R0[760] = (int16) (3 * R1[380] - 2295); + for (i = 379; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1531); + R0[2 * i] = (int16) (3 * r0 - 2295); + r1 = uint32_mod_uint14(r1, 1531); /* needed only for invalid inputs */ + R0[2 * i + 1] = (int16) (3 * r1 - 2295); + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x1531.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x1531.h new file mode 100644 index 0000000000..c55247c9c9 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X1531_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X1531_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x3.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x3.c new file mode 100644 index 0000000000..d5d6950884 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x3.c @@ -0,0 +1,24 @@ +#include "crypto_decode_761x3.h" + +#define uint8 uint8_t + +#define p 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3(void *v, const unsigned char *s) { + uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *s++; + *f++ = (uint8) ((x & 3) - 1); + x >>= 2; + *f++ = (uint8) ((x & 3) - 1); + x >>= 2; + *f++ = (uint8) ((x & 3) - 1); + x >>= 2; + *f++ = (uint8) ((x & 3) - 1); + } + x = *s++; + *f++ = (uint8) ((x & 3) - 1); +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x3.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x3.h new file mode 100644 index 0000000000..acf9d9cc09 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X3_H +#define 
PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X3_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3_STRBYTES 191 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x4591.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x4591.c new file mode 100644 index 0000000000..8402fcd276 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x4591.c @@ -0,0 +1,211 @@ +#include "crypto_decode_761x4591.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t +#define uint64 uint64_t + +/* +CPU division instruction typically takes time depending on x. +This software is designed to take time independent of x. +Time still varies depending on m; user must ensure that m is constant. +Time also varies on CPUs where multiplication is variable-time. +There could be more CPU issues. +There could also be compiler issues. +*/ + +static void uint32_divmod_uint14(uint32 *q, uint16 *r, uint32 x, uint16 m) { + uint32 v = 0x80000000; + uint32 qpart; + uint32 mask; + + v /= m; + + /* caller guarantees m > 0 */ + /* caller guarantees m < 16384 */ + /* vm <= 2^31 <= vm+m-1 */ + /* xvm <= 2^31 x <= xvm+x(m-1) */ + + *q = 0; + + qpart = (uint32) ((x * (uint64)v) >> 31); + /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */ + /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */ + /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */ + /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */ + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */ + + x -= qpart * m; + *q += qpart; + /* x <= 49146 */ + + qpart = (uint32) ((x * (uint64)v) >> 31); + /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */ + /* 0 <= newx <= m + 49146(2^14-1)/2^31 */ + /* 0 <= newx <= m + 0.4 */ + /* 0 <= newx <= m */ + + x -= qpart * m; + *q += qpart; + /* x <= m */ + + x -= m; + *q += 1; + mask = (~(x >> 31) + 1); + x += mask & (uint32)m; + *q += mask; + /* x < m */ + + *r = (uint16) x; +} + +static uint16 uint32_mod_uint14(uint32 x, uint16 m) { + uint32 q; + uint16 r; + uint32_divmod_uint14(&q, &r, x, m); + return r; +} + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591(void *v, const unsigned char *s) { + int16 *R0 = v; + uint16 R1[381], R2[191], R3[96], R4[48], R5[24], R6[12], R7[6], R8[3], R9[2], R10[1]; + long long i; + uint16 r0; + uint32 r1, r2; + + s += PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591_STRBYTES; + r1 = 0; + r1 = (r1 << 8) | *--s; + r1 = (r1 << 8) | *--s; + r1 = uint32_mod_uint14(r1, 1608); /* needed only for invalid inputs */ + R10[0] = (uint16) r1; + + r2 = R10[0]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 9470); + R9[0] = r0; + r1 = uint32_mod_uint14(r1, 11127); /* needed only for invalid inputs */ + R9[1] = (uint16) r1; + + R8[2] = R9[1]; + r2 = R9[0]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1557); + R8[0] = r0; + r1 = uint32_mod_uint14(r1, 1557); /* needed only for invalid inputs */ + R8[1] = (uint16) r1; + + r2 = R8[2]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10101); + R7[4] = r0; + r1 = uint32_mod_uint14(r1, 282); /* needed only for invalid inputs */ + R7[5] = (uint16) r1; + for (i = 1; i >= 0; --i) { + r2 = R8[i]; + r2 = (r2 << 8) | *--s; + r2 
= (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10101); + R7[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 10101); /* needed only for invalid inputs */ + R7[2 * i + 1] = (uint16) r1; + } + + r2 = R7[5]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1608); + R6[10] = r0; + r1 = uint32_mod_uint14(r1, 11468); /* needed only for invalid inputs */ + R6[11] = (uint16) r1; + for (i = 4; i >= 0; --i) { + r2 = R7[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1608); + R6[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1608); /* needed only for invalid inputs */ + R6[2 * i + 1] = (uint16) r1; + } + + r2 = R6[11]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10265); + R5[22] = r0; + r1 = uint32_mod_uint14(r1, 286); /* needed only for invalid inputs */ + R5[23] = (uint16) r1; + for (i = 10; i >= 0; --i) { + r2 = R6[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 10265); + R5[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 10265); /* needed only for invalid inputs */ + R5[2 * i + 1] = (uint16) r1; + } + + r2 = R5[23]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1621); + R4[46] = r0; + r1 = uint32_mod_uint14(r1, 11550); /* needed only for invalid inputs */ + R4[47] = (uint16) r1; + for (i = 22; i >= 0; --i) { + r2 = R5[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 1621); + R4[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 1621); /* needed only for invalid inputs */ + R4[2 * i + 1] = (uint16) r1; + } + + r2 = R4[47]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 644); + R3[94] = r0; + r1 = uint32_mod_uint14(r1, 4591); /* needed only for invalid inputs */ + R3[95] = (uint16) r1; + for (i = 46; i >= 0; --i) { + r2 = R4[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 644); + R3[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 644); /* needed only for invalid inputs */ + R3[2 * i + 1] = (uint16) r1; + } + + R2[190] = R3[95]; + for (i = 94; i >= 0; --i) { + r2 = R3[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 406); + R2[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 406); /* needed only for invalid inputs */ + R2[2 * i + 1] = (uint16) r1; + } + + R1[380] = R2[190]; + for (i = 189; i >= 0; --i) { + r2 = R2[i]; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 322); + R1[2 * i] = r0; + r1 = uint32_mod_uint14(r1, 322); /* needed only for invalid inputs */ + R1[2 * i + 1] = (uint16) r1; + } + + R0[760] = (int16) (R1[380] - 2295); + for (i = 379; i >= 0; --i) { + r2 = R1[i]; + r2 = (r2 << 8) | *--s; + r2 = (r2 << 8) | *--s; + uint32_divmod_uint14(&r1, &r0, r2, 4591); + R0[2 * i] = (int16) (r0 - 2295); + r1 = uint32_mod_uint14(r1, 4591); /* needed only for invalid inputs */ + R0[2 * i + 1] = (int16) (r1 - 2295); + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x4591.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x4591.h new file mode 100644 index 0000000000..49b80ca6d3 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761x4591.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X4591_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761X4591_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591_STRBYTES 1158 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591_ITEMBYTES 2 + +void 
PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint16.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint16.c new file mode 100644 index 0000000000..5d60473c04 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint16.c @@ -0,0 +1,15 @@ +#include "crypto_decode_761xint16.h" + + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16(void *v, const unsigned char *s) { + uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u0 = s[0]; + uint16_t u1 = s[1]; + *x = (uint16_t) (u0 | (u1 << 8)); + x += 1; + s += 2; + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint16.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint16.h new file mode 100644 index 0000000000..28790c40d8 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761XINT16_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761XINT16_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16_STRBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint32.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint32.c new file mode 100644 index 0000000000..71d4dcc4b5 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint32.c @@ -0,0 +1,20 @@ +#include "crypto_decode_761xint32.h" + + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32(void *v, const unsigned char *s) { + uint32_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint32_t u0 = s[0]; + uint32_t u1 = s[1]; + uint32_t u2 = s[2]; + uint32_t u3 = s[3]; + u1 <<= 8; + u2 <<= 16; + u3 <<= 24; + *x = u0 | u1 | u2 | u3; + x += 1; + s += 4; + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint32.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint32.h new file mode 100644 index 0000000000..c6ed070d6c --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_decode_761xint32.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761XINT32_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_DECODE_761XINT32_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32_STRBYTES 3044 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32_ITEMBYTES 4 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32(void *v, const unsigned char *s); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531.c new file mode 100644 index 0000000000..412fa7a468 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531.c @@ -0,0 +1,119 @@ +#include "crypto_encode_761x1531.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 380; ++i) { 
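+        /* Each rounded coefficient R0[j] is a multiple of 3 in [-2295, 2295].
+           Adding 2295 and multiplying by 10923 (about 2^15/3), then shifting
+           right by 15, divides it exactly by 3, giving a value in [0, 1530].
+           Pairs are then combined in radix 1531: the low byte goes to out and
+           the high part is carried in R[] for the next compression pass. */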
+ r0 = (uint16) ((((R0[2 * i] + 2295) & 16383) * 10923) >> 15); + r1 = (uint16) ((((R0[2 * i + 1] + 2295) & 16383) * 10923) >> 15); + r2 = r0 + r1 * (uint32)1531; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + R[380] = (uint16) ((((R0[760] + 2295) & 16383) * 10923) >> 15); + + for (i = 0; i < 190; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9157; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + R[190] = R[380]; + + for (i = 0; i < 95; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1280; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + R[95] = R[190]; + + for (i = 0; i < 48; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)6400; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + for (i = 0; i < 24; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)625; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + for (i = 0; i < 12; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1526; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + for (i = 0; i < 6; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)9097; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + for (i = 0; i < 3; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1263; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)6232; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)593; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + + r0 = R[0]; + *out++ = (unsigned char) r0; + r0 >>= 8; + *out++ = (unsigned char) r0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531.h new file mode 100644 index 0000000000..cc4f332c4e --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X1531_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X1531_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531(unsigned char *out, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531round.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531round.c new file mode 100644 index 0000000000..6906d3e2e3 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531round.c @@ -0,0 +1,17 @@ +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" + +#define int16 int16_t + +#define p 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round(unsigned char *out, const void *v) { + const int16 *a = v; + int16 x[p]; + int i; + + for (i = 0; i < p; ++i) { + x[i] = (int16) (3 * ((10923 * a[i] + 16384) >> 15)); + } + PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531(out, x); +} diff --git 
a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531round.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531round.h new file mode 100644 index 0000000000..64730f1f22 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x1531round.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X1531ROUND_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X1531ROUND_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round_STRBYTES 1007 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round(unsigned char *out, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x3.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x3.c new file mode 100644 index 0000000000..54deace03a --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x3.c @@ -0,0 +1,21 @@ +#include "crypto_encode_761x3.h" + +#define uint8 uint8_t + +#define p 761 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3(unsigned char *s, const void *v) { + const uint8 *f = v; + uint8 x; + int i; + + for (i = 0; i < p / 4; ++i) { + x = *f++ + 1; + x += (*f++ + 1) << 2; + x += (*f++ + 1) << 4; + x += (*f++ + 1) << 6; + *s++ = x; + } + x = *f++ + 1; + *s++ = x; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x3.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x3.h new file mode 100644 index 0000000000..e5ab1b175a --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X3_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X3_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3_STRBYTES 191 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3_ITEMBYTES 1 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3(unsigned char *s, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x4591.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x4591.c new file mode 100644 index 0000000000..c73645398b --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x4591.c @@ -0,0 +1,147 @@ +#include "crypto_encode_761x4591.h" + +/* auto-generated; do not edit */ + +#define int16 int16_t +#define uint16 uint16_t +#define uint32 uint32_t + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591(unsigned char *out, const void *v) { + const int16 *R0 = v; + /* XXX: caller could overlap R with input */ + uint16 R[381]; + long i; + uint16 r0, r1; + uint32 r2; + + for (i = 0; i < 380; ++i) { + r0 = (uint16) ((R0[2 * i] + 2295) & 16383); + r1 = (uint16) ((R0[2 * i + 1] + 2295) & 16383); + r2 = r0 + r1 * (uint32)4591; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + R[380] = (uint16) ((R0[760] + 2295) & 16383); + + for (i = 0; i < 190; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)322; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + R[190] = R[380]; + + for (i = 0; i < 95; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)406; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + R[95] = R[190]; + + for (i = 0; i < 48; ++i) { + r0 = R[2 * i]; + 
r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)644; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + + for (i = 0; i < 23; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1621; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + r0 = R[46]; + r1 = R[47]; + r2 = r0 + r1 * (uint32)1621; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[23] = (uint16) r2; + + for (i = 0; i < 11; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10265; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + r0 = R[22]; + r1 = R[23]; + r2 = r0 + r1 * (uint32)10265; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[11] = (uint16) r2; + + for (i = 0; i < 5; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)1608; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + r0 = R[10]; + r1 = R[11]; + r2 = r0 + r1 * (uint32)1608; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[5] = (uint16) r2; + + for (i = 0; i < 2; ++i) { + r0 = R[2 * i]; + r1 = R[2 * i + 1]; + r2 = r0 + r1 * (uint32)10101; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[i] = (uint16) r2; + } + r0 = R[4]; + r1 = R[5]; + r2 = r0 + r1 * (uint32)10101; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[2] = (uint16) r2; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)1557; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + R[1] = R[2]; + + r0 = R[0]; + r1 = R[1]; + r2 = r0 + r1 * (uint32)9470; + *out++ = (unsigned char) r2; + r2 >>= 8; + *out++ = (unsigned char) r2; + r2 >>= 8; + R[0] = (uint16) r2; + + r0 = R[0]; + *out++ = (unsigned char) r0; + r0 >>= 8; + *out++ = (unsigned char) r0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x4591.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x4591.h new file mode 100644 index 0000000000..b1d5f998ad --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761x4591.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X4591_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761X4591_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591_STRBYTES 1158 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591(unsigned char *out, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xfreeze3.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xfreeze3.c new file mode 100644 index 0000000000..f196e033a5 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xfreeze3.c @@ -0,0 +1,25 @@ +#include "crypto_encode_761xfreeze3.h" + +#define int16 int16_t + +#define p 761 + +/* valid inputs: -16384 <= x < 16384 */ +/* then 3 divides x-F3_freeze(x) */ +/* and F3_freeze(x) is in {-1,0,1} */ + +/* all inputs: 3 divides x-F3_freeze(x) */ +/* and F3_freeze(x) is in {-2,-1,0,1,2} */ + +static inline unsigned char F3_freeze(int16 x) { + return (unsigned char) (x - 3 * ((10923 * x + 16384) >> 15)); +} + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3(unsigned char *s, const void *v) { + const int16 *r = v; + + int i; + for (i = 0; i < p; ++i) { + s[i] = F3_freeze(r[i]); + } +} diff --git 
a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xfreeze3.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xfreeze3.h new file mode 100644 index 0000000000..14517f1312 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xfreeze3.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761XFREEZE3_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761XFREEZE3_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3_STRBYTES 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3(unsigned char *s, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xint16.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xint16.c new file mode 100644 index 0000000000..c5dcf16a86 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xint16.c @@ -0,0 +1,13 @@ +#include "crypto_encode_761xint16.h" + + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16(unsigned char *s, const void *v) { + const uint16_t *x = v; + int i; + + for (i = 0; i < 761; ++i) { + uint16_t u = *x++; + *s++ = (unsigned char) u; + *s++ = (unsigned char) (u >> 8); + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xint16.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xint16.h new file mode 100644 index 0000000000..1829942f8e --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_761xint16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761XINT16_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_761XINT16_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16_STRBYTES 1522 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16_ITEMS 761 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16(unsigned char *s, const void *v); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_int16.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_int16.c new file mode 100644 index 0000000000..78b0d68277 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_int16.c @@ -0,0 +1,9 @@ +#include "crypto_encode_int16.h" + +#define uint16 uint16_t + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16(unsigned char *s, const void *x) { + uint16 u = *(const uint16 *) x; + s[0] = (unsigned char) u; + s[1] = (unsigned char) (u >> 8); +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_int16.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_int16.h new file mode 100644 index 0000000000..9c9f100d2c --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_encode_int16.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_INT16_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_ENCODE_INT16_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16_STRBYTES 2 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16_ITEMS 1 +#define PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16_ITEMBYTES 2 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_encode_int16(unsigned char *s, const void *x); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_int32.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_int32.c new file mode 100644 index 0000000000..014421db98 --- /dev/null +++ 
b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_int32.c @@ -0,0 +1,84 @@ +#include "crypto_sort_int32.h" +#include + +#define int32 int32_t + +#define int32_MINMAX(a,b) \ + do { \ + int32_t ab = (b) ^ (a); \ + int32_t c = (int32_t)((int64_t)(b) - (int64_t)(a)); \ + c ^= ab & (c ^ (b)); \ + c >>= 31; \ + c &= ab; \ + (a) ^= c; \ + (b) ^= c; \ + } while(0) + +/* assume 2 <= n <= 0x40000000 */ +void PQCLEAN_SNTRUP761_CLEAN_crypto_sort_int32(int32_t *x, long long n) { + int32 top, p, q, r, i; + long long j; + + top = 1; + while (top < n - top) { + top += top; + } + + for (p = top; p >= 1; p >>= 1) { + i = 0; + while (i + 2 * p <= n) { + for (j = i; j < i + p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + i += 2 * p; + } + for (j = i; j < n - p; ++j) { + int32_MINMAX(x[j], x[j + p]); + } + + i = 0; + j = 0; + for (q = top; q > p; q >>= 1) { + if (j != i) { + for (;;) { + if (j == n - q) { + goto done; + } + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + if (j == i + p) { + i += 2 * p; + break; + } + } + } + while (i + p <= n - q) { + for (j = i; j < i + p; ++j) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + } + i += 2 * p; + } + /* now i + p > n - q */ + j = i; + while (j < n - q) { + int32 a = x[j + p]; + for (r = q; r > p; r >>= 1) { + int32_MINMAX(a, x[j + r]); + } + x[j + p] = a; + ++j; + } + +done: + ; + } + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_int32.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_int32.h new file mode 100644 index 0000000000..e603a7ccaa --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_int32.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SORT_INT32_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SORT_INT32_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_sort_int32_BYTES 4 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_sort_int32(int32_t *x, long long n); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_uint32.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_uint32.c new file mode 100644 index 0000000000..8ce0184d6d --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_uint32.c @@ -0,0 +1,18 @@ +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" + + +/* can save time by vectorizing xor loops */ +/* can save time by integrating xor loops with int32_sort */ + +void PQCLEAN_SNTRUP761_CLEAN_crypto_sort_uint32(void *array, long long n) { + uint32_t *x = array; + long long j; + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } + PQCLEAN_SNTRUP761_CLEAN_crypto_sort_int32((int32_t *)array, n); + for (j = 0; j < n; ++j) { + x[j] ^= 0x80000000; + } +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_uint32.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_uint32.h new file mode 100644 index 0000000000..8d62e22dca --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_sort_uint32.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SORT_UINT32_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_SORT_UINT32_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_sort_uint32_BYTES 4 + +void PQCLEAN_SNTRUP761_CLEAN_crypto_sort_uint32(void *array, long long n); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_verify_1039.c b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_verify_1039.c new file mode 100644 index 0000000000..9307fa4566 --- /dev/null +++ 
b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_verify_1039.c @@ -0,0 +1,13 @@ +#include "crypto_verify_1039.h" + + +int PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039(const unsigned char *x, const unsigned char *y) { + unsigned int differentbits = 0; + int i; + + for (i = 0; i < PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039_BYTES; ++i) { + differentbits |= x[i] ^ y[i]; + } + + return (int) (1 & ((differentbits - 1) >> 8)) - 1; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_verify_1039.h b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_verify_1039.h new file mode 100644 index 0000000000..f3ad9ba699 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/crypto_verify_1039.h @@ -0,0 +1,8 @@ +#ifndef PQCLEAN_SNTRUP761_CLEAN_CRYPTO_VERIFY_1039_H +#define PQCLEAN_SNTRUP761_CLEAN_CRYPTO_VERIFY_1039_H + +#include +#define PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039_BYTES 1039 + +int PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039(const unsigned char *x, const unsigned char *y); +#endif diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/kem.c b/src/kem/ntruprime/pqclean_sntrup761_clean/kem.c new file mode 100644 index 0000000000..4c1995c7b3 --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/kem.c @@ -0,0 +1,251 @@ +#include "api.h" +#include "crypto_declassify.h" +#include "crypto_sort_uint32.h" +#include "params.h" +#include "randombytes.h" +#include "sha2.h" + + + +#define int8 int8_t +#define int16 int16_t +#define int32 int32_t +#define uint16 uint16_t +#define uint32 uint32_t + +/* ----- arithmetic mod 3 */ + +typedef int8 small; +/* F3 is always represented as -1,0,1 */ + +/* ----- arithmetic mod q */ + +typedef int16 Fq; +/* always represented as -(q-1)/2...(q-1)/2 */ + +/* ----- small polynomials */ + +/* R3_fromR(R_fromRq(r)) */ +static void R3_fromRq(small *out, const Fq *r) { + crypto_encode_pxfreeze3((unsigned char *) out, (unsigned char *) r); +} + +/* h = f*g in the ring R3 */ +static void R3_mult(small *h, const small *f, const small *g) { + crypto_core_mult3((unsigned char *) h, (const unsigned char *) f, (const unsigned char *) g); +} + +/* ----- polynomials mod q */ + +/* h = h*g in the ring Rq */ +static void Rq_mult_small(Fq *h, const small *g) { + crypto_encode_pxint16((unsigned char *) h, h); + crypto_core_mult((unsigned char *) h, (const unsigned char *) h, (const unsigned char *) g); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* h = 3f in Rq */ +static void Rq_mult3(Fq *h, const Fq *f) { + crypto_encode_pxint16((unsigned char *) h, f); + crypto_core_scale3((unsigned char *) h, (const unsigned char *) h); + crypto_decode_pxint16(h, (const unsigned char *) h); +} + +/* out = 1/(3*in) in Rq */ +/* caller must have 2p+1 bytes free in out, not just 2p */ +static void Rq_recip3(Fq *out, const small *in) { + crypto_core_inv((unsigned char *) out, (const unsigned char *) in); + /* could check byte 2*p for failure; but, in context, inv always works */ + crypto_decode_pxint16(out, (unsigned char *) out); +} + +/* ----- underlying hash function */ + +#define Hash_bytes 32 + +static void Hash(unsigned char *out, const unsigned char *in, int inlen) { + unsigned char h[64]; + int i; + sha512(h, in, (size_t) inlen); + for (i = 0; i < 32; ++i) { + out[i] = h[i]; + } +} + +/* ----- higher-level randomness */ + +static void Short_random(small *out) { + uint32 L[ppadsort]; + int i; + + randombytes((unsigned char *) L, 4 * p); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < w; ++i) { + L[i] = L[i] & (uint32) - 2; + } + for (i 
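+    /* Entries w..p-1 get low bits 01, so they decode to 0 below, while the
+       first w entries (low bit cleared above) decode to -1 or +1. The
+       constant-time sort on the random high bits then scatters the w nonzero
+       positions uniformly, producing a weight-w short polynomial. */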
= w; i < p; ++i) { + L[i] = (L[i] & (uint32) - 3) | 1; + } + for (i = p; i < ppadsort; ++i) { + L[i] = 0xffffffff; + } + PQCLEAN_SNTRUP761_CLEAN_crypto_sort_uint32(L, ppadsort); + for (i = 0; i < p; ++i) { + out[i] = (small) ((L[i] & 3) - 1); + } +} + +static void Small_random(small *out) { + uint32 L[p]; + int i; + + randombytes((unsigned char *) L, sizeof L); + crypto_decode_pxint32(L, (unsigned char *) L); + for (i = 0; i < p; ++i) { + out[i] = (small) ((((L[i] & 0x3fffffff) * 3) >> 30) - 1); + } +} + +/* ----- Streamlined NTRU Prime */ + +typedef small Inputs[p]; /* passed by reference */ +#define Ciphertexts_bytes Rounded_bytes +#define SecretKeys_bytes (2*Small_bytes) +#define PublicKeys_bytes Rq_bytes +#define Confirm_bytes 32 + +/* c,r_enc[1:] = Hide(r,pk,cache); cache is Hash4(pk) */ +/* also set r_enc[0]=3 */ +/* also set x[0]=2, and x[1:1+Hash_bytes] = Hash3(r_enc) */ +/* also overwrite x[1+Hash_bytes:1+2*Hash_bytes] */ +static void Hide(unsigned char *x, unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { + Fq h[p]; + int i; + + Small_encode(r_enc + 1, r); + Rq_decode(h, pk); + Rq_mult_small(h, r); + Round_and_encode(c, h); + r_enc[0] = 3; + Hash(x + 1, r_enc, 1 + Small_bytes); + for (i = 0; i < Hash_bytes; ++i) { + x[1 + Hash_bytes + i] = cache[i]; + } + x[0] = 2; + Hash(c + Ciphertexts_bytes, x, 1 + Hash_bytes * 2); +} + + +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { + small g[p]; + for (;;) { + Small_random(g); + { + small v[p + 1]; + small vp; + crypto_core_inv3((unsigned char *) v, (const unsigned char *) g); + vp = v[p]; + crypto_declassify(&vp, sizeof vp); + if (vp == 0) { + Small_encode(sk + Small_bytes, v); + break; + } + } + } + { + small f[p]; + Short_random(f); + Small_encode(sk, f); + { + Fq h[p + 1]; + Rq_recip3(h, f); /* always works */ + Rq_mult_small(h, g); + Rq_encode(pk, h); + } + } + { + int i; + unsigned char sksave = sk[SecretKeys_bytes - 1]; + for (i = 0; i < PublicKeys_bytes; ++i) { + sk[SecretKeys_bytes + i] = pk[i]; + } + sk[SecretKeys_bytes - 1] = 4; + Hash(sk + SecretKeys_bytes + PublicKeys_bytes + Small_bytes, sk + SecretKeys_bytes - 1, 1 + PublicKeys_bytes); + sk[SecretKeys_bytes - 1] = sksave; + randombytes(sk + SecretKeys_bytes + PublicKeys_bytes, Small_bytes); + } + return 0; +} + +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { + unsigned char cache[Hash_bytes]; + int i; + { + unsigned char y[1 + PublicKeys_bytes]; /* XXX: can eliminate with incremental hashing */ + for (i = 0; i < PublicKeys_bytes; ++i) { + y[1 + i] = pk[i]; + } + y[0] = 4; + Hash(cache, y, sizeof y); + } + { + Inputs r; + Short_random(r); + { + unsigned char r_enc[Small_bytes + 1]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + Hide(x, c, r_enc, r, pk, cache); + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = 1; + Hash(k, x, sizeof x); + } + } + return 0; +} + +int PQCLEAN_SNTRUP761_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { + const unsigned char *pk = sk + SecretKeys_bytes; + const unsigned char *rho = pk + PublicKeys_bytes; + const unsigned char *cache = rho + Small_bytes; + int mask, i; + Inputs r; + { + Fq d[p]; + Rounded_decode(d, c); + { + small f[p]; + Small_decode(f, sk); + Rq_mult_small(d, f); + Rq_mult3(d, d); + } + { + small e[p]; + small v[p]; + R3_fromRq(e, d); + Small_decode(v, sk + Small_bytes); + R3_mult(r, e, v); + } + 
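+        /* crypto_core_wforce forces the recovered r to have weight exactly w
+           (substituting a fixed weight-w polynomial otherwise), so decryption
+           of malformed ciphertexts stays constant-time; the re-encryption and
+           confirmation check below then handles rejection. */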
crypto_core_wforce((unsigned char *) r, (unsigned char *) r); + } + { + unsigned char r_enc[1 + Small_bytes]; + unsigned char cnew[Ciphertexts_bytes + Confirm_bytes]; + unsigned char x[1 + Hash_bytes + Ciphertexts_bytes + Confirm_bytes]; + /* XXX: can use incremental hashing to reduce x size */ + + Hide(x, cnew, r_enc, r, pk, cache); + mask = crypto_verify_clen(c, cnew); + for (i = 0; i < Small_bytes; ++i) { + r_enc[i + 1] ^= (unsigned char) (mask & (r_enc[i + 1] ^ rho[i])); + } + Hash(x + 1, r_enc, 1 + Small_bytes); /* XXX: can instead do cmov on cached hash of rho */ + for (i = 0; i < Ciphertexts_bytes + Confirm_bytes; ++i) { + x[1 + Hash_bytes + i] = c[i]; + } + x[0] = (unsigned char) (1 + mask); + Hash(k, x, sizeof x); + } + return 0; +} diff --git a/src/kem/ntruprime/pqclean_sntrup761_clean/params.h b/src/kem/ntruprime/pqclean_sntrup761_clean/params.h new file mode 100644 index 0000000000..4b8adb499c --- /dev/null +++ b/src/kem/ntruprime/pqclean_sntrup761_clean/params.h @@ -0,0 +1,72 @@ +#ifndef params_H +#define params_H +#include "crypto_core_inv3sntrup761.h" +#include "crypto_core_invsntrup761.h" +#include "crypto_core_mult3sntrup761.h" +#include "crypto_core_multsntrup761.h" +#include "crypto_core_scale3sntrup761.h" +#include "crypto_core_weightsntrup761.h" +#include "crypto_core_wforcesntrup761.h" +#include "crypto_decode_761x1531.h" +#include "crypto_decode_761x3.h" +#include "crypto_decode_761x4591.h" +#include "crypto_decode_761xint16.h" +#include "crypto_decode_761xint32.h" +#include "crypto_encode_761x1531.h" +#include "crypto_encode_761x1531round.h" +#include "crypto_encode_761x3.h" +#include "crypto_encode_761x4591.h" +#include "crypto_encode_761xfreeze3.h" +#include "crypto_encode_761xint16.h" +#include "crypto_encode_int16.h" +#include "crypto_sort_int32.h" +#include "crypto_sort_uint32.h" +#include "crypto_verify_1039.h" + + +#define q31 467759 /* floor(2^31/q) */ +#define q27 29235 /* closest integer to 2^27/q */ +#define q18 57 /* closest integer to 2^18/q */ +#define q14 4 /* closest integer to 2^14/q */ +#define p 761 +#define q 4591 +#define w 286 + +#define ppadsort 761 + +#define crypto_verify_clen PQCLEAN_SNTRUP761_CLEAN_crypto_verify_1039 + +#define Rq_bytes PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591_STRBYTES +#define Rq_encode PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x4591 +#define Rq_decode PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x4591 + +#define Rounded_bytes PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531_STRBYTES +#define Rounded_decode PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x1531 + +#define Round_and_encode PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x1531round + +#define Small_bytes PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3_STRBYTES +#define Small_encode PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761x3 +#define Small_decode PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761x3 + +#define crypto_encode_pxfreeze3 PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xfreeze3 + +#define crypto_decode_pxint32 PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint32 + +#define crypto_decode_pxint16 PQCLEAN_SNTRUP761_CLEAN_crypto_decode_761xint16 + +#define crypto_encode_pxint16 PQCLEAN_SNTRUP761_CLEAN_crypto_encode_761xint16 + +#define crypto_core_wforce PQCLEAN_SNTRUP761_CLEAN_crypto_core_wforcesntrup761 + +#define crypto_core_scale3 PQCLEAN_SNTRUP761_CLEAN_crypto_core_scale3sntrup761 + +#define crypto_core_inv PQCLEAN_SNTRUP761_CLEAN_crypto_core_invsntrup761 + +#define crypto_core_inv3 PQCLEAN_SNTRUP761_CLEAN_crypto_core_inv3sntrup761 + +#define crypto_core_mult3 
PQCLEAN_SNTRUP761_CLEAN_crypto_core_mult3sntrup761 + +#define crypto_core_mult PQCLEAN_SNTRUP761_CLEAN_crypto_core_multsntrup761 + +#endif diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake index 8ff7a55555..74b581cf82 100644 --- a/src/oqsconfig.h.cmake +++ b/src/oqsconfig.h.cmake @@ -111,6 +111,10 @@ #cmakedefine OQS_ENABLE_KEM_kyber_1024_90s 1 #cmakedefine OQS_ENABLE_KEM_kyber_1024_90s_avx2 1 +#cmakedefine OQS_ENABLE_KEM_NTRUPRIME 1 +#cmakedefine OQS_ENABLE_KEM_ntruprime_sntrup761 1 +#cmakedefine OQS_ENABLE_KEM_ntruprime_sntrup761_avx2 1 + #cmakedefine OQS_ENABLE_SIG_DILITHIUM 1 #cmakedefine OQS_ENABLE_SIG_dilithium_2 1 #cmakedefine OQS_ENABLE_SIG_dilithium_2_avx2 1 diff --git a/tests/KATs/kem/kats.json b/tests/KATs/kem/kats.json index 2677741d65..8da01994c8 100644 --- a/tests/KATs/kem/kats.json +++ b/tests/KATs/kem/kats.json @@ -29,5 +29,6 @@ "Kyber512": "bb0481d3325d828817900b709d23917cefbc10026fc857f098979451f67bb0ca", "Kyber512-90s": "7bfe0653b63b3fac7ee300a6e4801046c1a3d8d445b271633b6c9d81ed125e5b", "Kyber768": "89e82a5bf2d4ddb2c6444e10409e6d9ca65dafbca67d1a0db2c9b54920a29172", - "Kyber768-90s": "68bf2e3914c0b4e053cefc67dd9f10f567946da5720f0b453b347610c3cc2c0a" + "Kyber768-90s": "68bf2e3914c0b4e053cefc67dd9f10f567946da5720f0b453b347610c3cc2c0a", + "sntrup761": "afc42c3a5b10f4ef69654250097ebda9b9564570f4086744b24a6daf2bd1f89a" } diff --git a/tests/constant_time/kem/issues.json b/tests/constant_time/kem/issues.json index ecef7fb0a5..fa5bf711ad 100644 --- a/tests/constant_time/kem/issues.json +++ b/tests/constant_time/kem/issues.json @@ -25,5 +25,6 @@ "Kyber512": [], "Kyber512-90s": [], "Kyber768": [], - "Kyber768-90s": [] + "Kyber768-90s": [], + "sntrup761": [] } diff --git a/tests/constant_time/kem/passes.json b/tests/constant_time/kem/passes.json index f6252c9971..4e542e49c1 100644 --- a/tests/constant_time/kem/passes.json +++ b/tests/constant_time/kem/passes.json @@ -25,5 +25,6 @@ "Kyber512": ["kyber"], "Kyber512-90s": ["kyber-90s"], "Kyber768": ["kyber"], - "Kyber768-90s": ["kyber-90s"] + "Kyber768-90s": ["kyber-90s"], + "sntrup761": ["sntrup"] } diff --git a/tests/constant_time/kem/passes/sntrup b/tests/constant_time/kem/passes/sntrup new file mode 100644 index 0000000000..f811e8dd4f --- /dev/null +++ b/tests/constant_time/kem/passes/sntrup @@ -0,0 +1,7 @@ +{ + Rejection sampling on non-invertible g + Memcheck:Cond + src:kem.c:149 # fun:PQCLEAN_SNTRUP*_CLEAN_crypto_kem_keypair + fun:OQS_KEM_ntruprime_sntrup*_keypair + fun:OQS_KEM_keypair +}
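A minimal sketch of exercising the newly wired-up KEM through liboqs's generic API, assuming the algorithm is registered under the name "sntrup761" (the identifier used in the KAT and constant-time tables above) and built with OQS_ENABLE_KEM_ntruprime_sntrup761:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <oqs/oqs.h>

int main(void) {
    /* "sntrup761" matches the identifier used in tests/KATs/kem/kats.json;
       adjust if the registered method name differs in your build. */
    OQS_KEM *kem = OQS_KEM_new("sntrup761");
    if (kem == NULL) {
        fprintf(stderr, "sntrup761 is not enabled in this build\n");
        return EXIT_FAILURE;
    }

    uint8_t *pk = malloc(kem->length_public_key);
    uint8_t *sk = malloc(kem->length_secret_key);
    uint8_t *ct = malloc(kem->length_ciphertext);
    uint8_t *ss_enc = malloc(kem->length_shared_secret);
    uint8_t *ss_dec = malloc(kem->length_shared_secret);

    int ok = pk && sk && ct && ss_enc && ss_dec &&
             OQS_KEM_keypair(kem, pk, sk) == OQS_SUCCESS &&
             OQS_KEM_encaps(kem, ct, ss_enc, pk) == OQS_SUCCESS &&
             OQS_KEM_decaps(kem, ss_dec, ct, sk) == OQS_SUCCESS &&
             memcmp(ss_enc, ss_dec, kem->length_shared_secret) == 0;

    printf("sntrup761 round trip: %s\n", ok ? "ok" : "FAILED");

    /* Zero secret material before releasing it. */
    if (sk) {
        OQS_MEM_secure_free(sk, kem->length_secret_key);
    }
    free(pk);
    free(ct);
    free(ss_enc);
    free(ss_dec);
    OQS_KEM_free(kem);
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}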