diff --git a/code/jasmin/mlkem_avx2/poly.jinc b/code/jasmin/mlkem_avx2/poly.jinc index 409e8900..0b962871 100644 --- a/code/jasmin/mlkem_avx2/poly.jinc +++ b/code/jasmin/mlkem_avx2/poly.jinc @@ -685,132 +685,6 @@ fn __poly_cbd_eta2(reg ptr u16[MLKEM_N] rp, reg ptr u8[MLKEM_ETA2*MLKEM_N/4] buf return rp; } -/* -#[returnaddress="stack"] -fn _poly_getnoise(reg ptr u16[MLKEM_N] rp, reg ptr u8[MLKEM_SYMBYTES] seed, reg u8 nonce) -> reg ptr u16[MLKEM_N] -{ - inline int i; - reg u256 f0 f1 f2 f3; - reg u256 mask55 mask33 mask03 mask0F; - reg u128 t; - reg u64 t64; - stack ptr u16[MLKEM_N] srp; - stack u8[128] buf; - stack u8[33] extseed; - stack u32 mask55_s mask33_s mask03_s mask0F_s; - - mask55_s = 0x55555555; - mask33_s = 0x33333333; - mask03_s = 0x03030303; - mask0F_s = 0x0F0F0F0F; - - srp = rp; - - for i=0 to MLKEM_SYMBYTES/8 - { - t64 = seed[u64 i]; - extseed[u64 i] = t64; - } - extseed[MLKEM_SYMBYTES] = nonce; - - buf = _shake256_128_33(buf, extseed); - - mask55 = #VPBROADCAST_8u32(mask55_s); - mask33 = #VPBROADCAST_8u32(mask33_s); - mask03 = #VPBROADCAST_8u32(mask03_s); - mask0F = #VPBROADCAST_8u32(mask0F_s); - - rp = srp; - - for i=0 to MLKEM_N/64 - { - f0 = buf[u256 i]; - - f1 = #VPSRL_16u16(f0, 1); - f0 = #VPAND_256(mask55, f0); - f1 = #VPAND_256(mask55, f1); - f0 = #VPADD_32u8(f0, f1); - - f1 = #VPSRL_16u16(f0, 2); - f0 = #VPAND_256(mask33, f0); - f1 = #VPAND_256(mask33, f1); - f0 = #VPADD_32u8(f0, mask33); - f0 = #VPSUB_32u8(f0, f1); - - f1 = #VPSRL_16u16(f0, 4); - f0 = #VPAND_256(mask0F, f0); - f1 = #VPAND_256(mask0F, f1); - f0 = #VPSUB_32u8(f0, mask03); - f1 = #VPSUB_32u8(f1, mask03); - - f2 = #VPUNPCKL_32u8(f0, f1); - f3 = #VPUNPCKH_32u8(f0, f1); - - t = (128u)f2; - f0 = #VPMOVSX_16u8_16u16(t); - t = #VEXTRACTI128(f2, 1); - f1 = #VPMOVSX_16u8_16u16(t); - t = (128u)f3; - f2 = #VPMOVSX_16u8_16u16(t); - t = #VEXTRACTI128(f3, 1); - f3 = #VPMOVSX_16u8_16u16(t); - rp[u256 4*i] = f0; - rp[u256 4*i + 1] = f2; - rp[u256 4*i + 2] = f1; - rp[u256 4*i + 3] = f3; - } - - return rp; -} -*/ - -/* OLD_KECCAK -inline -fn __shake256_squeezenblocks4x(reg ptr u256[25] state, reg ptr u8[NOISE_NBLOCKS * SHAKE256_RATE] buf0 buf1 buf2 buf3) -> reg ptr u256[25], reg ptr u8[NOISE_NBLOCKS*SHAKE256_RATE], reg ptr u8[NOISE_NBLOCKS*SHAKE256_RATE], reg ptr u8[NOISE_NBLOCKS*SHAKE256_RATE], reg ptr u8[NOISE_NBLOCKS*SHAKE256_RATE] -{ - inline int i; - - for i = 0 to NOISE_NBLOCKS - { - state, buf0[i*SHAKE256_RATE:SHAKE256_RATE], buf1[i*SHAKE256_RATE:SHAKE256_RATE], buf2[i*SHAKE256_RATE:SHAKE256_RATE], buf3[i*SHAKE256_RATE:SHAKE256_RATE] = __shake256_squeezeblock4x(state, buf0[i*SHAKE256_RATE:SHAKE256_RATE], buf1[i*SHAKE256_RATE:SHAKE256_RATE], buf2[i*SHAKE256_RATE:SHAKE256_RATE], buf3[i*SHAKE256_RATE:SHAKE256_RATE]); - } - - return state, buf0, buf1, buf2, buf3; -} - -#[returnaddress="stack"] -fn _poly_getnoise_eta1_4x(reg ptr u16[MLKEM_N] r0 r1 r2 r3, reg ptr u8[MLKEM_SYMBYTES] seed, reg u8 nonce) -> reg ptr u16[MLKEM_N], reg ptr u16[MLKEM_N], reg ptr u16[MLKEM_N], reg ptr u16[MLKEM_N] -{ - reg u256 f; - stack u256[25] state; - stack u8[NOISE_NBLOCKS * SHAKE256_RATE] buf0 buf1 buf2 buf3; - - f = seed[u256 0]; - buf0[u256 0] = f; - buf1[u256 0] = f; - buf2[u256 0] = f; - buf3[u256 0] = f; - - buf0.[32] = nonce; - nonce += 1; - buf1.[32] = nonce; - nonce += 1; - buf2.[32] = nonce; - nonce += 1; - buf3.[32] = nonce; - - state = _shake256_absorb4x_33(state, buf0[0:33], buf1[0:33], buf2[0:33], buf3[0:33]); - state, buf0, buf1, buf2, buf3 = __shake256_squeezenblocks4x(state, buf0, buf1, buf2, buf3); - - r0 = __poly_cbd_eta1(r0, buf0[0:MLKEM_ETA1*MLKEM_N/4+(MLKEM_ETA1 - 2)*8]); - r1 = __poly_cbd_eta1(r1, buf1[0:MLKEM_ETA1*MLKEM_N/4+(MLKEM_ETA1 - 2)*8]); - r2 = __poly_cbd_eta1(r2, buf2[0:MLKEM_ETA1*MLKEM_N/4+(MLKEM_ETA1 - 2)*8]); - r3 = __poly_cbd_eta1(r3, buf3[0:MLKEM_ETA1*MLKEM_N/4+(MLKEM_ETA1 - 2)*8]); - - return r0, r1, r2, r3; -} -*/ - #[returnaddress="stack"] fn _poly_getnoise_eta1_4x ( reg ptr u16[MLKEM_N] r0 r1 r2 r3 @@ -849,40 +723,6 @@ fn _poly_getnoise_eta1_4x return r0, r1, r2, r3; } -/* OLD_KECCAK -#[returnaddress="stack"] -fn _poly_getnoise_eta1122_4x(reg ptr u16[MLKEM_N] r0 r1 r2 r3, reg ptr u8[MLKEM_SYMBYTES] seed, reg u8 nonce) -> reg ptr u16[MLKEM_N], reg ptr u16[MLKEM_N], reg ptr u16[MLKEM_N], reg ptr u16[MLKEM_N] -{ - reg u256 f; - stack u256[25] state; - stack u8[NOISE_NBLOCKS * SHAKE256_RATE] buf0 buf1 buf2 buf3; - - f = seed[u256 0]; - buf0[u256 0] = f; - buf1[u256 0] = f; - buf2[u256 0] = f; - buf3[u256 0] = f; - - buf0.[32] = nonce; - nonce += 1; - buf1.[32] = nonce; - nonce += 1; - buf2.[32] = nonce; - nonce += 1; - buf3.[32] = nonce; - - state = _shake256_absorb4x_33(state, buf0[0:33], buf1[0:33], buf2[0:33], buf3[0:33]); - state, buf0, buf1, buf2, buf3 = __shake256_squeezenblocks4x(state, buf0, buf1, buf2, buf3); - - r0 = __poly_cbd_eta1(r0, buf0[0:MLKEM_ETA1*MLKEM_N/4+(MLKEM_ETA1 - 2)*8]); - r1 = __poly_cbd_eta1(r1, buf1[0:MLKEM_ETA1*MLKEM_N/4+(MLKEM_ETA1 - 2)*8]); - r2 = __poly_cbd_eta2(r2, buf2[0:MLKEM_ETA2*MLKEM_N/4]); - r3 = __poly_cbd_eta2(r3, buf3[0:MLKEM_ETA2*MLKEM_N/4]); - - return r0, r1, r2, r3; -} -*/ - #[returnaddress="stack"] fn _poly_getnoise_eta1122_4x ( reg ptr u16[MLKEM_N] r0 r1 r2 r3 @@ -1129,30 +969,22 @@ fn __butterfly64x(reg u256 rl0 rl1 rl2 rl3 rh0 rh1 rh2 rh3 zl0 zl1 zh0 zh1 qx16) t4 = #VPMULH_16u16(t4, qx16); t6 = #VPMULH_16u16(t6, qx16); - //rh1 = #VPSUB_16u16(t3, rl1); rh1 = #VPSUB_16u16(rl1, t3); rl1 = #VPADD_16u16(t3, rl1); - //rh0 = #VPSUB_16u16(t1, rl0); rh0 = #VPSUB_16u16(rl0, t1); rl0 = #VPADD_16u16(t1, rl0); - //rh3 = #VPSUB_16u16(t7, rl3); rh3 = #VPSUB_16u16(rl3, t7); rl3 = #VPADD_16u16(t7, rl3); - //rh2 = #VPSUB_16u16(t5, rl2); rh2 = #VPSUB_16u16(rl2, t5); rl2 = #VPADD_16u16(t5, rl2); rh0 = #VPADD_16u16(t0, rh0); - //rl0 = #VPSUB_16u16(t0, rl0); rl0 = #VPSUB_16u16(rl0, t0); rh1 = #VPADD_16u16(t2, rh1); - //rl1 = #VPSUB_16u16(t2, rl1); rl1 = #VPSUB_16u16(rl1, t2); rh2 = #VPADD_16u16(t4, rh2); - //rl2 = #VPSUB_16u16(t4, rl2); rl2 = #VPSUB_16u16(rl2, t4); rh3 = #VPADD_16u16(t6, rh3); - //rl3 = #VPSUB_16u16(t6, rl3); rl3 = #VPSUB_16u16(rl3, t6); return rl0, rl1, rl2, rl3, rh0, rh1, rh2, rh3; @@ -1201,12 +1033,6 @@ fn _poly_ntt(reg ptr u16[MLKEM_N] rp) -> reg ptr u16[MLKEM_N] r0, r1, r2, r3, r4, r5, r6, r7 = __butterfly64x(r0, r1, r2, r3, r4, r5, r6, r7, zeta0, zeta0, zeta1, zeta1, qx16); - /* - rp.[u256 32*4] = r0; - rp.[u256 32*5] = r1; - rp.[u256 32*6] = r2; - rp.[u256 32*7] = r3; - */ rp.[u256 32*12] = r4; rp.[u256 32*13] = r5; rp.[u256 32*14] = r6;