diff --git a/docs/algorithms/sig/mayo.md b/docs/algorithms/sig/mayo.md index fc313da3d9..4132a3b254 100644 --- a/docs/algorithms/sig/mayo.md +++ b/docs/algorithms/sig/mayo.md @@ -6,7 +6,7 @@ - **Authors' website**: https://pqmayo.org - **Specification version**: 1.1. - **Primary Source**: - - **Source**: https://github.com/PQCMayo/MAYO-C/commit/5002c0c1772b7ea7eb75da14d8b12c42903f5fda with copy_from_upstream patches + - **Source**: https://github.com/PQCMayo/MAYO-C/commit/4694f8b2b3002d6a987a7f7323cd01a44bdc7225 with copy_from_upstream patches - **Implementation license (SPDX-Identifier)**: Apache-2.0 diff --git a/docs/algorithms/sig/mayo.yml b/docs/algorithms/sig/mayo.yml index 1dcfbb732d..04edb95fe9 100644 --- a/docs/algorithms/sig/mayo.yml +++ b/docs/algorithms/sig/mayo.yml @@ -11,7 +11,7 @@ website: https://pqmayo.org nist-round: 1 spec-version: 1.1 primary-upstream: - source: https://github.com/PQCMayo/MAYO-C/commit/5002c0c1772b7ea7eb75da14d8b12c42903f5fda + source: https://github.com/PQCMayo/MAYO-C/commit/4694f8b2b3002d6a987a7f7323cd01a44bdc7225 with copy_from_upstream patches spdx-license-identifier: Apache-2.0 parameter-sets: diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index bd762f0371..44ddf5072b 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -57,7 +57,7 @@ upstreams: name: pqmayo git_url: https://github.com/PQCMayo/MAYO-C.git git_branch: bhe-liboqs-integration - git_commit: 5002c0c1772b7ea7eb75da14d8b12c42903f5fda + git_commit: 4694f8b2b3002d6a987a7f7323cd01a44bdc7225 sig_meta_path: 'META/{pretty_name_full}_META.yml' sig_scheme_path: '.' patches: [pqmayo-aes.patch, pqmayo-mem.patch] diff --git a/src/sig/mayo/CMakeLists.txt b/src/sig/mayo/CMakeLists.txt index e82f04180c..612390dd2d 100644 --- a/src/sig/mayo/CMakeLists.txt +++ b/src/sig/mayo/CMakeLists.txt @@ -6,7 +6,7 @@ set(_MAYO_OBJS "") if(OQS_ENABLE_SIG_mayo_1) - add_library(mayo_1_opt OBJECT sig_mayo_1.c pqmayo_mayo_1_opt/aes128ctr.c pqmayo_mayo_1_opt/api.c pqmayo_mayo_1_opt/arithmetic.c pqmayo_mayo_1_opt/mayo.c pqmayo_mayo_1_opt/params.c) + add_library(mayo_1_opt OBJECT sig_mayo_1.c pqmayo_mayo_1_opt/api.c pqmayo_mayo_1_opt/arithmetic.c pqmayo_mayo_1_opt/mayo.c pqmayo_mayo_1_opt/params.c) target_compile_options(mayo_1_opt PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL) target_include_directories(mayo_1_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_1_opt) target_include_directories(mayo_1_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) @@ -15,7 +15,7 @@ if(OQS_ENABLE_SIG_mayo_1) endif() if(OQS_ENABLE_SIG_mayo_1_avx2) - add_library(mayo_1_avx2 OBJECT pqmayo_mayo_1_avx2/aes128ctr.c pqmayo_mayo_1_avx2/api.c pqmayo_mayo_1_avx2/arithmetic.c pqmayo_mayo_1_avx2/mayo.c pqmayo_mayo_1_avx2/params.c) + add_library(mayo_1_avx2 OBJECT pqmayo_mayo_1_avx2/api.c pqmayo_mayo_1_avx2/arithmetic.c pqmayo_mayo_1_avx2/mayo.c pqmayo_mayo_1_avx2/params.c) target_include_directories(mayo_1_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_1_avx2) target_include_directories(mayo_1_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(mayo_1_avx2 PRIVATE -mavx2) @@ -24,7 +24,7 @@ if(OQS_ENABLE_SIG_mayo_1_avx2) endif() if(OQS_ENABLE_SIG_mayo_2) - add_library(mayo_2_opt OBJECT sig_mayo_2.c pqmayo_mayo_2_opt/aes128ctr.c pqmayo_mayo_2_opt/api.c pqmayo_mayo_2_opt/arithmetic.c pqmayo_mayo_2_opt/mayo.c pqmayo_mayo_2_opt/params.c) + add_library(mayo_2_opt OBJECT sig_mayo_2.c pqmayo_mayo_2_opt/api.c pqmayo_mayo_2_opt/arithmetic.c pqmayo_mayo_2_opt/mayo.c pqmayo_mayo_2_opt/params.c) target_compile_options(mayo_2_opt PUBLIC -DMAYO_VARIANT=MAYO_2 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL) target_include_directories(mayo_2_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_2_opt) target_include_directories(mayo_2_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) @@ -33,7 +33,7 @@ if(OQS_ENABLE_SIG_mayo_2) endif() if(OQS_ENABLE_SIG_mayo_2_avx2) - add_library(mayo_2_avx2 OBJECT pqmayo_mayo_2_avx2/aes128ctr.c pqmayo_mayo_2_avx2/api.c pqmayo_mayo_2_avx2/arithmetic.c pqmayo_mayo_2_avx2/mayo.c pqmayo_mayo_2_avx2/params.c) + add_library(mayo_2_avx2 OBJECT pqmayo_mayo_2_avx2/api.c pqmayo_mayo_2_avx2/arithmetic.c pqmayo_mayo_2_avx2/mayo.c pqmayo_mayo_2_avx2/params.c) target_include_directories(mayo_2_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_2_avx2) target_include_directories(mayo_2_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(mayo_2_avx2 PRIVATE -mavx2) @@ -42,7 +42,7 @@ if(OQS_ENABLE_SIG_mayo_2_avx2) endif() if(OQS_ENABLE_SIG_mayo_3) - add_library(mayo_3_opt OBJECT sig_mayo_3.c pqmayo_mayo_3_opt/aes128ctr.c pqmayo_mayo_3_opt/api.c pqmayo_mayo_3_opt/arithmetic.c pqmayo_mayo_3_opt/mayo.c pqmayo_mayo_3_opt/params.c) + add_library(mayo_3_opt OBJECT sig_mayo_3.c pqmayo_mayo_3_opt/api.c pqmayo_mayo_3_opt/arithmetic.c pqmayo_mayo_3_opt/mayo.c pqmayo_mayo_3_opt/params.c) target_compile_options(mayo_3_opt PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) target_include_directories(mayo_3_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_3_opt) target_include_directories(mayo_3_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) @@ -51,7 +51,7 @@ if(OQS_ENABLE_SIG_mayo_3) endif() if(OQS_ENABLE_SIG_mayo_3_avx2) - add_library(mayo_3_avx2 OBJECT pqmayo_mayo_3_avx2/aes128ctr.c pqmayo_mayo_3_avx2/api.c pqmayo_mayo_3_avx2/arithmetic.c pqmayo_mayo_3_avx2/mayo.c pqmayo_mayo_3_avx2/params.c) + add_library(mayo_3_avx2 OBJECT pqmayo_mayo_3_avx2/api.c pqmayo_mayo_3_avx2/arithmetic.c pqmayo_mayo_3_avx2/mayo.c pqmayo_mayo_3_avx2/params.c) target_include_directories(mayo_3_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_3_avx2) target_include_directories(mayo_3_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(mayo_3_avx2 PRIVATE -mavx2) diff --git a/src/sig/mayo/pqmayo_mayo_1_avx2/aes128ctr.c b/src/sig/mayo/pqmayo_mayo_1_avx2/aes128ctr.c deleted file mode 100644 index 9382136337..0000000000 --- a/src/sig/mayo/pqmayo_mayo_1_avx2/aes128ctr.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 and MIT and Public Domain - -#ifdef ENABLE_AESNI - -#include -#include -#include -#include -#include - -// Adapted from liboqs/src/common/aes which in turn takes it from: -// crypto_core/aes128ncrypt/dolbeau/aesenc-int -// (https://bench.cr.yp.to/supercop.html) -static inline void aes128ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[11]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - BLOCK1(0x10); - BLOCK1(0x20); - BLOCK1(0x40); - BLOCK1(0x80); - BLOCK1(0x1b); - BLOCK1(0x36); - rkeys[idx++] = temp0; -} - -void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(11 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 11 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenc_si128(temp, rkeys[4]); - temp = _mm_aesenc_si128(temp, rkeys[5]); - temp = _mm_aesenc_si128(temp, rkeys[6]); - temp = _mm_aesenc_si128(temp, rkeys[7]); - temp = _mm_aesenc_si128(temp, rkeys[8]); - temp = _mm_aesenc_si128(temp, rkeys[9]); - temp = _mm_aesenclast_si128(temp, rkeys[10]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - AESNENCX4(4); - AESNENCX4(5); - AESNENCX4(6); - AESNENCX4(7); - AESNENCX4(8); - AESNENCX4(9); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[10]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[10]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[10]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[10]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - // block = _mm_xor_si128(block, block); // set to zero - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128_load_schedule_ni(input, &schedule); - oqs_aes128_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128_free_schedule_ni(schedule); - return (int)outputByteLen; -} - -// 4-Round AES... - -// From crypto_core/aes128ncrypt/dolbeau/aesenc-int -static inline void aes128r4ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[5]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - - /* blockshift-based block by Cedric Bourrasset */ -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - rkeys[idx++] = temp0; -} - -void oqs_aes128r4_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(5 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128r4ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128r4_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 5 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128r4ni_encrypt(const __m128i rkeys[5], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenclast_si128(temp, rkeys[4]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128r4ni_encrypt_x4(const __m128i rkeys[5], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[4]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[4]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[4]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[4]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128r4_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128r4ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128r4ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128r4ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128r4_load_schedule_ni(input, &schedule); - oqs_aes128r4_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128r4_free_schedule_ni(schedule); - return (int)outputByteLen; -} -#endif \ No newline at end of file diff --git a/src/sig/mayo/pqmayo_mayo_1_opt/aes128ctr.c b/src/sig/mayo/pqmayo_mayo_1_opt/aes128ctr.c deleted file mode 100644 index 9382136337..0000000000 --- a/src/sig/mayo/pqmayo_mayo_1_opt/aes128ctr.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 and MIT and Public Domain - -#ifdef ENABLE_AESNI - -#include -#include -#include -#include -#include - -// Adapted from liboqs/src/common/aes which in turn takes it from: -// crypto_core/aes128ncrypt/dolbeau/aesenc-int -// (https://bench.cr.yp.to/supercop.html) -static inline void aes128ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[11]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - BLOCK1(0x10); - BLOCK1(0x20); - BLOCK1(0x40); - BLOCK1(0x80); - BLOCK1(0x1b); - BLOCK1(0x36); - rkeys[idx++] = temp0; -} - -void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(11 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 11 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenc_si128(temp, rkeys[4]); - temp = _mm_aesenc_si128(temp, rkeys[5]); - temp = _mm_aesenc_si128(temp, rkeys[6]); - temp = _mm_aesenc_si128(temp, rkeys[7]); - temp = _mm_aesenc_si128(temp, rkeys[8]); - temp = _mm_aesenc_si128(temp, rkeys[9]); - temp = _mm_aesenclast_si128(temp, rkeys[10]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - AESNENCX4(4); - AESNENCX4(5); - AESNENCX4(6); - AESNENCX4(7); - AESNENCX4(8); - AESNENCX4(9); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[10]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[10]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[10]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[10]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - // block = _mm_xor_si128(block, block); // set to zero - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128_load_schedule_ni(input, &schedule); - oqs_aes128_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128_free_schedule_ni(schedule); - return (int)outputByteLen; -} - -// 4-Round AES... - -// From crypto_core/aes128ncrypt/dolbeau/aesenc-int -static inline void aes128r4ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[5]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - - /* blockshift-based block by Cedric Bourrasset */ -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - rkeys[idx++] = temp0; -} - -void oqs_aes128r4_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(5 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128r4ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128r4_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 5 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128r4ni_encrypt(const __m128i rkeys[5], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenclast_si128(temp, rkeys[4]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128r4ni_encrypt_x4(const __m128i rkeys[5], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[4]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[4]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[4]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[4]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128r4_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128r4ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128r4ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128r4ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128r4_load_schedule_ni(input, &schedule); - oqs_aes128r4_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128r4_free_schedule_ni(schedule); - return (int)outputByteLen; -} -#endif \ No newline at end of file diff --git a/src/sig/mayo/pqmayo_mayo_2_avx2/aes128ctr.c b/src/sig/mayo/pqmayo_mayo_2_avx2/aes128ctr.c deleted file mode 100644 index 9382136337..0000000000 --- a/src/sig/mayo/pqmayo_mayo_2_avx2/aes128ctr.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 and MIT and Public Domain - -#ifdef ENABLE_AESNI - -#include -#include -#include -#include -#include - -// Adapted from liboqs/src/common/aes which in turn takes it from: -// crypto_core/aes128ncrypt/dolbeau/aesenc-int -// (https://bench.cr.yp.to/supercop.html) -static inline void aes128ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[11]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - BLOCK1(0x10); - BLOCK1(0x20); - BLOCK1(0x40); - BLOCK1(0x80); - BLOCK1(0x1b); - BLOCK1(0x36); - rkeys[idx++] = temp0; -} - -void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(11 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 11 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenc_si128(temp, rkeys[4]); - temp = _mm_aesenc_si128(temp, rkeys[5]); - temp = _mm_aesenc_si128(temp, rkeys[6]); - temp = _mm_aesenc_si128(temp, rkeys[7]); - temp = _mm_aesenc_si128(temp, rkeys[8]); - temp = _mm_aesenc_si128(temp, rkeys[9]); - temp = _mm_aesenclast_si128(temp, rkeys[10]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - AESNENCX4(4); - AESNENCX4(5); - AESNENCX4(6); - AESNENCX4(7); - AESNENCX4(8); - AESNENCX4(9); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[10]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[10]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[10]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[10]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - // block = _mm_xor_si128(block, block); // set to zero - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128_load_schedule_ni(input, &schedule); - oqs_aes128_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128_free_schedule_ni(schedule); - return (int)outputByteLen; -} - -// 4-Round AES... - -// From crypto_core/aes128ncrypt/dolbeau/aesenc-int -static inline void aes128r4ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[5]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - - /* blockshift-based block by Cedric Bourrasset */ -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - rkeys[idx++] = temp0; -} - -void oqs_aes128r4_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(5 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128r4ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128r4_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 5 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128r4ni_encrypt(const __m128i rkeys[5], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenclast_si128(temp, rkeys[4]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128r4ni_encrypt_x4(const __m128i rkeys[5], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[4]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[4]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[4]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[4]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128r4_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128r4ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128r4ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128r4ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128r4_load_schedule_ni(input, &schedule); - oqs_aes128r4_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128r4_free_schedule_ni(schedule); - return (int)outputByteLen; -} -#endif \ No newline at end of file diff --git a/src/sig/mayo/pqmayo_mayo_2_opt/aes128ctr.c b/src/sig/mayo/pqmayo_mayo_2_opt/aes128ctr.c deleted file mode 100644 index 9382136337..0000000000 --- a/src/sig/mayo/pqmayo_mayo_2_opt/aes128ctr.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 and MIT and Public Domain - -#ifdef ENABLE_AESNI - -#include -#include -#include -#include -#include - -// Adapted from liboqs/src/common/aes which in turn takes it from: -// crypto_core/aes128ncrypt/dolbeau/aesenc-int -// (https://bench.cr.yp.to/supercop.html) -static inline void aes128ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[11]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - BLOCK1(0x10); - BLOCK1(0x20); - BLOCK1(0x40); - BLOCK1(0x80); - BLOCK1(0x1b); - BLOCK1(0x36); - rkeys[idx++] = temp0; -} - -void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(11 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 11 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenc_si128(temp, rkeys[4]); - temp = _mm_aesenc_si128(temp, rkeys[5]); - temp = _mm_aesenc_si128(temp, rkeys[6]); - temp = _mm_aesenc_si128(temp, rkeys[7]); - temp = _mm_aesenc_si128(temp, rkeys[8]); - temp = _mm_aesenc_si128(temp, rkeys[9]); - temp = _mm_aesenclast_si128(temp, rkeys[10]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - AESNENCX4(4); - AESNENCX4(5); - AESNENCX4(6); - AESNENCX4(7); - AESNENCX4(8); - AESNENCX4(9); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[10]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[10]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[10]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[10]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - // block = _mm_xor_si128(block, block); // set to zero - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128_load_schedule_ni(input, &schedule); - oqs_aes128_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128_free_schedule_ni(schedule); - return (int)outputByteLen; -} - -// 4-Round AES... - -// From crypto_core/aes128ncrypt/dolbeau/aesenc-int -static inline void aes128r4ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[5]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - - /* blockshift-based block by Cedric Bourrasset */ -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - rkeys[idx++] = temp0; -} - -void oqs_aes128r4_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(5 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128r4ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128r4_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 5 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128r4ni_encrypt(const __m128i rkeys[5], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenclast_si128(temp, rkeys[4]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128r4ni_encrypt_x4(const __m128i rkeys[5], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[4]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[4]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[4]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[4]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128r4_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128r4ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128r4ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128r4ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128r4_load_schedule_ni(input, &schedule); - oqs_aes128r4_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128r4_free_schedule_ni(schedule); - return (int)outputByteLen; -} -#endif \ No newline at end of file diff --git a/src/sig/mayo/pqmayo_mayo_3_avx2/aes128ctr.c b/src/sig/mayo/pqmayo_mayo_3_avx2/aes128ctr.c deleted file mode 100644 index 9382136337..0000000000 --- a/src/sig/mayo/pqmayo_mayo_3_avx2/aes128ctr.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 and MIT and Public Domain - -#ifdef ENABLE_AESNI - -#include -#include -#include -#include -#include - -// Adapted from liboqs/src/common/aes which in turn takes it from: -// crypto_core/aes128ncrypt/dolbeau/aesenc-int -// (https://bench.cr.yp.to/supercop.html) -static inline void aes128ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[11]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - BLOCK1(0x10); - BLOCK1(0x20); - BLOCK1(0x40); - BLOCK1(0x80); - BLOCK1(0x1b); - BLOCK1(0x36); - rkeys[idx++] = temp0; -} - -void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(11 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 11 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenc_si128(temp, rkeys[4]); - temp = _mm_aesenc_si128(temp, rkeys[5]); - temp = _mm_aesenc_si128(temp, rkeys[6]); - temp = _mm_aesenc_si128(temp, rkeys[7]); - temp = _mm_aesenc_si128(temp, rkeys[8]); - temp = _mm_aesenc_si128(temp, rkeys[9]); - temp = _mm_aesenclast_si128(temp, rkeys[10]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - AESNENCX4(4); - AESNENCX4(5); - AESNENCX4(6); - AESNENCX4(7); - AESNENCX4(8); - AESNENCX4(9); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[10]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[10]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[10]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[10]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - // block = _mm_xor_si128(block, block); // set to zero - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128_load_schedule_ni(input, &schedule); - oqs_aes128_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128_free_schedule_ni(schedule); - return (int)outputByteLen; -} - -// 4-Round AES... - -// From crypto_core/aes128ncrypt/dolbeau/aesenc-int -static inline void aes128r4ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[5]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - - /* blockshift-based block by Cedric Bourrasset */ -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - rkeys[idx++] = temp0; -} - -void oqs_aes128r4_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(5 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128r4ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128r4_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 5 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128r4ni_encrypt(const __m128i rkeys[5], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenclast_si128(temp, rkeys[4]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128r4ni_encrypt_x4(const __m128i rkeys[5], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[4]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[4]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[4]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[4]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128r4_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128r4ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128r4ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128r4ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128r4_load_schedule_ni(input, &schedule); - oqs_aes128r4_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128r4_free_schedule_ni(schedule); - return (int)outputByteLen; -} -#endif \ No newline at end of file diff --git a/src/sig/mayo/pqmayo_mayo_3_opt/aes128ctr.c b/src/sig/mayo/pqmayo_mayo_3_opt/aes128ctr.c deleted file mode 100644 index 9382136337..0000000000 --- a/src/sig/mayo/pqmayo_mayo_3_opt/aes128ctr.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 and MIT and Public Domain - -#ifdef ENABLE_AESNI - -#include -#include -#include -#include -#include - -// Adapted from liboqs/src/common/aes which in turn takes it from: -// crypto_core/aes128ncrypt/dolbeau/aesenc-int -// (https://bench.cr.yp.to/supercop.html) -static inline void aes128ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[11]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - BLOCK1(0x10); - BLOCK1(0x20); - BLOCK1(0x40); - BLOCK1(0x80); - BLOCK1(0x1b); - BLOCK1(0x36); - rkeys[idx++] = temp0; -} - -void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(11 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 11 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenc_si128(temp, rkeys[4]); - temp = _mm_aesenc_si128(temp, rkeys[5]); - temp = _mm_aesenc_si128(temp, rkeys[6]); - temp = _mm_aesenc_si128(temp, rkeys[7]); - temp = _mm_aesenc_si128(temp, rkeys[8]); - temp = _mm_aesenc_si128(temp, rkeys[9]); - temp = _mm_aesenclast_si128(temp, rkeys[10]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - AESNENCX4(4); - AESNENCX4(5); - AESNENCX4(6); - AESNENCX4(7); - AESNENCX4(8); - AESNENCX4(9); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[10]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[10]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[10]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[10]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - // block = _mm_xor_si128(block, block); // set to zero - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128_load_schedule_ni(input, &schedule); - oqs_aes128_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128_free_schedule_ni(schedule); - return (int)outputByteLen; -} - -// 4-Round AES... - -// From crypto_core/aes128ncrypt/dolbeau/aesenc-int -static inline void aes128r4ni_setkey_encrypt(const unsigned char *key, - __m128i rkeys[5]) { - __m128i key0 = _mm_loadu_si128((const __m128i *)(key + 0)); - __m128i temp0, temp1, temp4; - int idx = 0; - - temp0 = key0; - - /* blockshift-based block by Cedric Bourrasset */ -#define BLOCK1(IMM) \ - temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ - rkeys[idx++] = temp0; \ - temp4 = _mm_slli_si128(temp0, 4); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp4 = _mm_slli_si128(temp0, 8); \ - temp0 = _mm_xor_si128(temp0, temp4); \ - temp1 = _mm_shuffle_epi32(temp1, 0xff); \ - temp0 = _mm_xor_si128(temp0, temp1) - - BLOCK1(0x01); - BLOCK1(0x02); - BLOCK1(0x04); - BLOCK1(0x08); - rkeys[idx++] = temp0; -} - -void oqs_aes128r4_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(5 * sizeof(__m128i)); - // assert(*_schedule != NULL); - __m128i *schedule = (__m128i *)*_schedule; - aes128r4ni_setkey_encrypt(key, schedule); -} - -void oqs_aes128r4_free_schedule_ni(void *schedule) { - if (schedule != NULL) { - mayo_secure_free(schedule, 5 * sizeof(__m128i)); - } -} - -// Single encryption -static inline void aes128r4ni_encrypt(const __m128i rkeys[5], __m128i nv, - unsigned char *out) { - __m128i temp = _mm_xor_si128(nv, rkeys[0]); - temp = _mm_aesenc_si128(temp, rkeys[1]); - temp = _mm_aesenc_si128(temp, rkeys[2]); - temp = _mm_aesenc_si128(temp, rkeys[3]); - temp = _mm_aesenclast_si128(temp, rkeys[4]); - _mm_storeu_si128((__m128i *)(out), temp); -} - -// 4x interleaved encryption -static inline void aes128r4ni_encrypt_x4(const __m128i rkeys[5], __m128i n0, - __m128i n1, __m128i n2, __m128i n3, - unsigned char *out) { - __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); - -#define AESNENCX4(IDX) \ - temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ - temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ - temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ - temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) - - AESNENCX4(1); - AESNENCX4(2); - AESNENCX4(3); - - temp0 = _mm_aesenclast_si128(temp0, rkeys[4]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[4]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[4]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[4]); - - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); -} - -// Not for general use: IV = 0, nonce = 0 -static void oqs_aes128r4_ctr_enc_sch_ni(const void *schedule, uint8_t *out, - size_t out_len) { - __m128i mask = - _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); - __m128i block = _mm_set_epi64x(0, 0); - - while (out_len >= 64) { - __m128i nv0 = block; - __m128i nv1 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - __m128i nv2 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), - mask); - __m128i nv3 = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), - mask); - aes128r4ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), - mask); - out += 64; - out_len -= 64; - } - while (out_len >= 16) { - aes128r4ni_encrypt(schedule, block, out); - out += 16; - out_len -= 16; - block = _mm_shuffle_epi8( - _mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), - mask); - } - if (out_len > 0) { - uint8_t tmp[16]; - aes128r4ni_encrypt(schedule, block, tmp); - memcpy(out, tmp, out_len); - } -} - -int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, - const unsigned char *input, size_t inputByteLen) { - void *schedule = NULL; - oqs_aes128r4_load_schedule_ni(input, &schedule); - oqs_aes128r4_ctr_enc_sch_ni(schedule, output, outputByteLen); - oqs_aes128r4_free_schedule_ni(schedule); - return (int)outputByteLen; -} -#endif \ No newline at end of file