From 695dc30deca7b9f8919cf05e0fc0f8045a1bbe01 Mon Sep 17 00:00:00 2001
From: Duc Nguyen
Date: Mon, 11 Sep 2023 23:15:12 -0400
Subject: [PATCH] fix falcon-1024

---
 .../pqclean_falcon-1024_aarch64/poly_int.c    | 70 ++++++++++---------
 .../pqclean_falcon-512_aarch64/poly_int.c     | 18 ++---
 2 files changed, 45 insertions(+), 43 deletions(-)

diff --git a/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c b/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
index cf1cd3828e..cfccc2afcd 100644
--- a/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
+++ b/src/sig/falcon/pqclean_falcon-1024_aarch64/poly_int.c
@@ -198,13 +198,14 @@ uint16_t PQCLEAN_FALCON1024_AARCH64_poly_compare_with_zero(int16_t f[FALCON_N])
  * If coefficient is larger than Q, it is subtracted with Q
  */
 void PQCLEAN_FALCON1024_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]) {
-    // Total SIMD registers: 26 = 8 + 16 + 2
+    // Total SIMD registers: 26 = 8 + 16 + 1 + 1
     uint16x8x4_t b0, b1;         // 8
     int16x8x4_t a0, a1, c0, c1;  // 16
-    int16x8_t neon_q, neon_2q;   // 2
+    int16x8_t neon_q;            // 1
+    uint16x8_t neon_2q;          // 1
 
     neon_q = vdupq_n_s16(FALCON_Q);
-    neon_2q = vdupq_n_s16(FALCON_Q << 1);
+    neon_2q = vdupq_n_u16(FALCON_Q << 1);
 
     for (int i = 0; i < FALCON_N; i += 64) {
         vload_s16_x4(a0, &f[i]);
@@ -222,15 +223,15 @@ void PQCLEAN_FALCON1024_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]) {
         b1.val[2] = vcltzq_s16(a1.val[2]);
         b1.val[3] = vcltzq_s16(a1.val[3]);
 
-        c0.val[0] = vandq_s16(b0.val[0], neon_2q);
-        c0.val[1] = vandq_s16(b0.val[1], neon_2q);
-        c0.val[2] = vandq_s16(b0.val[2], neon_2q);
-        c0.val[3] = vandq_s16(b0.val[3], neon_2q);
+        c0.val[0] = vreinterpretq_s16_u16(vandq_u16(b0.val[0], neon_2q));
+        c0.val[1] = vreinterpretq_s16_u16(vandq_u16(b0.val[1], neon_2q));
+        c0.val[2] = vreinterpretq_s16_u16(vandq_u16(b0.val[2], neon_2q));
+        c0.val[3] = vreinterpretq_s16_u16(vandq_u16(b0.val[3], neon_2q));
 
-        c1.val[0] = vandq_s16(b1.val[0], neon_2q);
-        c1.val[1] = vandq_s16(b1.val[1], neon_2q);
-        c1.val[2] = vandq_s16(b1.val[2], neon_2q);
-        c1.val[3] = vandq_s16(b1.val[3], neon_2q);
+        c1.val[0] = vreinterpretq_s16_u16(vandq_u16(b1.val[0], neon_2q));
+        c1.val[1] = vreinterpretq_s16_u16(vandq_u16(b1.val[1], neon_2q));
+        c1.val[2] = vreinterpretq_s16_u16(vandq_u16(b1.val[2], neon_2q));
+        c1.val[3] = vreinterpretq_s16_u16(vandq_u16(b1.val[3], neon_2q));
 
         vadd_x4(a0, a0, c0);
         vadd_x4(a1, a1, c1);
@@ -271,17 +272,18 @@ void PQCLEAN_FALCON1024_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]) {
  */
 int PQCLEAN_FALCON1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int16_t t[FALCON_N]) {
     // Total SIMD registers: 32
-    int16x8x4_t a, f;                                            // 8
-    uint16x8x4_t c0, c1, d0, d1;                                 // 16
-    uint16x8x2_t e;                                              // 2
-    int8x16x4_t g;                                               // 4
-    int16x8_t neon_127, neon__127, neon_q_2, neon__q_2, neon_q;  // 5
+    int16x8x4_t a, f;                                    // 8
+    uint16x8x4_t c0, c1, d0, d1;                         // 16
+    uint16x8x2_t e;                                      // 2
+    int8x16x4_t g;                                       // 4
+    int16x8_t neon_127, neon__127, neon_q_2, neon__q_2;  // 4
+    uint16x8_t neon_q;                                   // 1
 
     neon_127 = vdupq_n_s16(127);
     neon__127 = vdupq_n_s16(-127);
-    neon_q = vdupq_n_s16(FALCON_Q);
     neon_q_2 = vdupq_n_s16(FALCON_Q >> 1);
     neon__q_2 = vdupq_n_s16(-(FALCON_Q >> 1));
+    neon_q = vdupq_n_u16(FALCON_Q);
     e.val[1] = vdupq_n_u16(0);
 
     for (int i = 0; i < FALCON_N; i += 64) {
@@ -301,15 +303,15 @@ int PQCLEAN_FALCON1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int1
         c1.val[3] = vcgeq_s16(f.val[3], neon_q_2);
 
         // Perform subtraction with Q
-        c0.val[0] = vandq_s16(vreinterpretq_s16_u16(c0.val[0]), neon_q);
-        c0.val[1] = vandq_s16(vreinterpretq_s16_u16(c0.val[1]), neon_q);
-        c0.val[2] = vandq_s16(vreinterpretq_s16_u16(c0.val[2]), neon_q);
-        c0.val[3] = vandq_s16(vreinterpretq_s16_u16(c0.val[3]), neon_q);
+        c0.val[0] = vandq_u16(c0.val[0], neon_q);
+        c0.val[1] = vandq_u16(c0.val[1], neon_q);
+        c0.val[2] = vandq_u16(c0.val[2], neon_q);
+        c0.val[3] = vandq_u16(c0.val[3], neon_q);
 
-        c1.val[0] = vandq_s16(vreinterpretq_s16_u16(c1.val[0]), neon_q);
-        c1.val[1] = vandq_s16(vreinterpretq_s16_u16(c1.val[1]), neon_q);
-        c1.val[2] = vandq_s16(vreinterpretq_s16_u16(c1.val[2]), neon_q);
-        c1.val[3] = vandq_s16(vreinterpretq_s16_u16(c1.val[3]), neon_q);
+        c1.val[0] = vandq_u16(c1.val[0], neon_q);
+        c1.val[1] = vandq_u16(c1.val[1], neon_q);
+        c1.val[2] = vandq_u16(c1.val[2], neon_q);
+        c1.val[3] = vandq_u16(c1.val[3], neon_q);
 
         vsub_x4(a, a, c0);
         vsub_x4(f, f, c1);
@@ -326,15 +328,15 @@ int PQCLEAN_FALCON1024_AARCH64_poly_int16_to_int8(int8_t G[FALCON_N], const int1
         d1.val[3] = vcgtq_s16(neon__q_2, f.val[3]);
 
         // Perform addition with Q
-        d0.val[0] = vandq_s16(vreinterpretq_s16_u16(d0.val[0]), neon_q);
-        d0.val[1] = vandq_s16(vreinterpretq_s16_u16(d0.val[1]), neon_q);
-        d0.val[2] = vandq_s16(vreinterpretq_s16_u16(d0.val[2]), neon_q);
-        d0.val[3] = vandq_s16(vreinterpretq_s16_u16(d0.val[3]), neon_q);
-
-        d1.val[0] = vandq_s16(vreinterpretq_s16_u16(d1.val[0]), neon_q);
-        d1.val[1] = vandq_s16(vreinterpretq_s16_u16(d1.val[1]), neon_q);
-        d1.val[2] = vandq_s16(vreinterpretq_s16_u16(d1.val[2]), neon_q);
-        d1.val[3] = vandq_s16(vreinterpretq_s16_u16(d1.val[3]), neon_q);
+        d0.val[0] = vandq_u16(d0.val[0], neon_q);
+        d0.val[1] = vandq_u16(d0.val[1], neon_q);
+        d0.val[2] = vandq_u16(d0.val[2], neon_q);
+        d0.val[3] = vandq_u16(d0.val[3], neon_q);
+
+        d1.val[0] = vandq_u16(d1.val[0], neon_q);
+        d1.val[1] = vandq_u16(d1.val[1], neon_q);
+        d1.val[2] = vandq_u16(d1.val[2], neon_q);
+        d1.val[3] = vandq_u16(d1.val[3], neon_q);
 
         vadd_x4(a, a, d0);
         vadd_x4(f, f, d1);
diff --git a/src/sig/falcon/pqclean_falcon-512_aarch64/poly_int.c b/src/sig/falcon/pqclean_falcon-512_aarch64/poly_int.c
index a5033f8643..c52deca248 100644
--- a/src/sig/falcon/pqclean_falcon-512_aarch64/poly_int.c
+++ b/src/sig/falcon/pqclean_falcon-512_aarch64/poly_int.c
@@ -223,15 +223,15 @@ void PQCLEAN_FALCON512_AARCH64_poly_convert_to_unsigned(int16_t f[FALCON_N]) {
         b1.val[2] = vcltzq_s16(a1.val[2]);
         b1.val[3] = vcltzq_s16(a1.val[3]);
 
-        c0.val[0] = (int16x8_t) vandq_u16(b0.val[0], neon_2q);
-        c0.val[1] = (int16x8_t) vandq_u16(b0.val[1], neon_2q);
-        c0.val[2] = (int16x8_t) vandq_u16(b0.val[2], neon_2q);
-        c0.val[3] = (int16x8_t) vandq_u16(b0.val[3], neon_2q);
-
-        c1.val[0] = (int16x8_t) vandq_u16(b1.val[0], neon_2q);
-        c1.val[1] = (int16x8_t) vandq_u16(b1.val[1], neon_2q);
-        c1.val[2] = (int16x8_t) vandq_u16(b1.val[2], neon_2q);
-        c1.val[3] = (int16x8_t) vandq_u16(b1.val[3], neon_2q);
+        c0.val[0] = vreinterpretq_s16_u16(vandq_u16(b0.val[0], neon_2q));
+        c0.val[1] = vreinterpretq_s16_u16(vandq_u16(b0.val[1], neon_2q));
+        c0.val[2] = vreinterpretq_s16_u16(vandq_u16(b0.val[2], neon_2q));
+        c0.val[3] = vreinterpretq_s16_u16(vandq_u16(b0.val[3], neon_2q));
+
+        c1.val[0] = vreinterpretq_s16_u16(vandq_u16(b1.val[0], neon_2q));
+        c1.val[1] = vreinterpretq_s16_u16(vandq_u16(b1.val[1], neon_2q));
+        c1.val[2] = vreinterpretq_s16_u16(vandq_u16(b1.val[2], neon_2q));
+        c1.val[3] = vreinterpretq_s16_u16(vandq_u16(b1.val[3], neon_2q));
 
         vadd_x4(a0, a0, c0);
         vadd_x4(a1, a1, c1);
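
Note: the snippet below is a minimal standalone sketch, not part of the patch, of the mask-and-add pattern that both files keep type-correct after this change: the vcltzq_s16/vcgeq_s16 comparisons produce unsigned uint16x8_t masks, so the AND against the constant is done with vandq_u16 and the result is reinterpreted back to signed only where a signed add follows. The helper name demo_convert_to_unsigned_block and the assumption that inputs lie in (-2*FALCON_Q, 2*FALCON_Q) are illustrative and not taken from the Falcon sources.

    #include <arm_neon.h>

    #define FALCON_Q 12289

    /* Hypothetical helper, for illustration only: lift one block of 8 signed
     * coefficients into the non-negative range by adding 2*Q to the negative
     * lanes, mirroring the patched poly_convert_to_unsigned loop body. */
    static inline int16x8_t demo_convert_to_unsigned_block(int16x8_t a) {
        uint16x8_t neon_2q = vdupq_n_u16(FALCON_Q << 1);
        uint16x8_t mask = vcltzq_s16(a);            /* 0xFFFF where a < 0, else 0x0000 */
        uint16x8_t add = vandq_u16(mask, neon_2q);  /* 2*Q in negative lanes, 0 elsewhere */
        return vaddq_s16(a, vreinterpretq_s16_u16(add));
    }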