diff --git a/.github/workflows/cross.yml b/.github/workflows/cross.yml index 48ab046..7b36c23 100644 --- a/.github/workflows/cross.yml +++ b/.github/workflows/cross.yml @@ -17,14 +17,13 @@ jobs: mips64-unknown-linux-gnuabi64, ] feature: [kyber512, kyber768, kyber1024] - opt: ["", 90s, "90s-fixslice"] steps: - uses: actions/checkout@v3 - - name: Cross Compile Tests ${{ matrix.target }} ${{ matrix.feature }} ${{ matrix.opt }} + - name: Cross Compile Tests ${{ matrix.target }} ${{ matrix.feature }} uses: actions-rs/cargo@v1.0.1 with: use-cross: true command: test - args: --target ${{ matrix.target }} --features "${{ matrix.feature }} ${{ matrix.opt }}" + args: --target ${{ matrix.target }} --features "${{ matrix.feature }}" diff --git a/.github/workflows/kat.yml b/.github/workflows/kat.yml index 504092a..ba93663 100644 --- a/.github/workflows/kat.yml +++ b/.github/workflows/kat.yml @@ -35,4 +35,4 @@ jobs: working-directory: ./tests run: | chmod +x run_all_tests.sh - KAT=1 AVX2=1 NASM=1 ./run_all_tests.sh \ No newline at end of file + KAT=1 ./run_all_tests.sh diff --git a/Cargo.toml b/Cargo.toml index 19f6a66..4305bc7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,11 +14,8 @@ readme = "readme.md" [dependencies] rand_core = { version = "0.6.4", default-features = false } wasm-bindgen = { version = "0.2.84", optional = true } -sha2 = { version = "0.10.6", optional = true , default-features = false } getrandom = {version = "0.2.9", features = ["js"], optional = true } zeroize = { version = "1.6.0", features = ["derive"], optional = true } -aes = { version = "0.8.2", optional = true } -ctr = { version = "0.9.2", optional = true } # Optional dev-deps, see https://github.com/rust-lang/cargo/issues/1596 criterion = { version = "0.4.0", features = ["html_reports"], optional = true } @@ -32,8 +29,6 @@ optional = true rand = "0.8.5" [build-dependencies] -cc = {version = "1.0.73", optional = true } -nasm-rs = {version = "0.2.4", optional = true } [lib] crate-type = ["cdylib", "rlib"] @@ -50,31 +45,9 @@ kyber512 = [] kyber768 = [] kyber1024 = [] -### Export IND-CPA primitives -# **WARNING** use with caution -hazmat = [] - -### Additional features ### -# 90s mode uses AES256-CTR and SHA2 as primitives instead -# Uses a bitslice implementation -90s = ["sha2"] - -# Fixslice RustCrypto AES implementation offers some additional sidechannel -# attack resistance. Suggest benchmarking for comparison. -90s-fixslice = ["90s", "aes", "ctr"] - -# Use avx2 intrinsics on x86 architectures -# Wont compile if the platform doesn't support it -avx2 = ["cc"] - # For compiling to wasm targets wasm = ["wasm-bindgen", "getrandom", "rand"] -# Uses Netwide Assembler avx2 code instead of GAS, this offers increased -# portability, you will need a nasm compiler installed. -# Can be downloaded from https://www.nasm.us/ -nasm = ["nasm-rs", "avx2"] - # Enable std library support std = [] diff --git a/benches/readme.md b/benches/readme.md index 7ab2358..24dc076 100644 --- a/benches/readme.md +++ b/benches/readme.md @@ -15,7 +15,7 @@ Don't be surprised to significant speedups and regressions. You will need to enable the benchmarking feature to run: ```bash -cargo bench --features "benchmarking kyber1024 avx2" +cargo bench --features "benchmarking kyber1024" ``` This is a workaround for issues with address sanitizer checks in the test suite. 
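A minimal sketch of driving the same commands locally that the trimmed CI jobs and the bench note above use; the feature names come from the Cargo.toml hunk, while the shell loop and an installed Rust toolchain are assumptions:

```bash
# Sketch: exercise each remaining security level, mirroring cross.yml's feature matrix
for level in kyber512 kyber768 kyber1024; do
  cargo test --features "$level"
done

# Benchmarks require the "benchmarking" feature, as noted in benches/readme.md
cargo bench --features "benchmarking kyber1024"

# Known-answer tests, as run by kat.yml
cd tests && chmod +x run_all_tests.sh && KAT=1 ./run_all_tests.sh
```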
diff --git a/build.rs b/build.rs deleted file mode 100644 index c9c4c8b..0000000 --- a/build.rs +++ /dev/null @@ -1,41 +0,0 @@ -fn main() { - #[cfg(not(feature = "wasm"))] - { - #[cfg(feature = "avx2")] - { - - const FILES: [&str; 5] = ["basemul", "fq", "invntt", "ntt", "shuffle"]; - - #[cfg(feature = "nasm")] - { - const ROOT: &str = "src/avx2/nasm/"; - let paths = FILES.iter().map(|file| format!("{}{}.asm", ROOT, file)); - - let mut nasm = nasm_rs::Build::new(); - let mut linker = cc::Build::new(); - - nasm.files(paths); - nasm.include(ROOT); - - for o in nasm.compile_objects().expect(" - Compiling NASM files: - Ensure it is installed and in your path - https://www.nasm.us/" - ) { - linker.object(o); - } - linker.compile("safe_pqc_kyber"); - } - - #[cfg(not(feature = "nasm"))] - { - const ROOT: &str = "src/avx2/"; - let paths = FILES.iter().map(|file| format!("{}{}.S", ROOT, file)); - cc::Build::new() - .include(ROOT) - .files(paths) - .compile("safe_pqc_kyber"); - } - } - } -} \ No newline at end of file diff --git a/fuzz/readme.md b/fuzz/readme.md index e8a60f9..e4ecbee 100644 --- a/fuzz/readme.md +++ b/fuzz/readme.md @@ -28,11 +28,11 @@ cargo hfuzz run Run different security levels and modes: ```bash -cargo hfuzz run --features "kyber512 90s" +cargo hfuzz run --features "kyber512" ``` Current targets are: * keypair * encap -* decap \ No newline at end of file +* decap diff --git a/readme.md b/readme.md index 689010f..24bd176 100644 --- a/readme.md +++ b/readme.md @@ -17,7 +17,6 @@ A rust implementation of the Kyber algorithm, a KEM standardised by the NIST Pos This library: * Is no_std compatible and needs no allocator, suitable for embedded devices. * Reference files contain no unsafe code and are written in pure rust. -* On x86_64 platforms offers an avx2 optimized version, which includes assembly from the C reference repo. * Compiles to WASM using wasm-bindgen and has a ready-to-use binary published on NPM. @@ -43,12 +42,6 @@ cargo add safe_pqc_kyber use safe_pqc_kyber::*; ``` -For optimisations on x86 platforms enable the `avx2` feature and the following RUSTFLAGS: - -```shell -export RUSTFLAGS="-C target-feature=+aes,+avx2,+sse2,+sse4.1,+bmi2,+popcnt" -``` - --- ### Key Encapsulation @@ -134,7 +127,7 @@ If no security level is specified then kyber768 is used by default as recommende ```toml [dependencies] -safe_pqc_kyber = {version = "0.6.0", features = ["kyber512", "90s", "avx2"]} +safe_pqc_kyber = {version = "0.6.0", features = ["kyber512"]} ``` @@ -143,11 +136,7 @@ safe_pqc_kyber = {version = "0.6.0", features = ["kyber512", "90s", "avx2"]} | std | Enable the standard library | | kyber512 | Enables kyber512 mode, with a security level roughly equivalent to AES-128.| | kyber1024 | Enables kyber1024 mode, with a security level roughly equivalent to AES-256. A compile-time error is raised if more than one security level is specified.| -| 90s | Uses AES256 in counter mode and SHA2 as a replacement for SHAKE. This can provide hardware speedups in some cases.| -| 90s-fixslice | Uses a fixslice implementation of AES256 by RustCrypto, this provides greater side-channel attack resistance, especially on embedded platforms | -| avx2 | On x86_64 platforms enable the optimized version. This flag is will cause a compile error on other architectures. | | wasm | For compiling to WASM targets| -| nasm | Uses Netwide Assembler avx2 code instead of GAS for portability. 
Requires a nasm compiler: https://www.nasm.us/ | | zeroize | This will zero out the key exchange structs on drop using the [zeroize](https://docs.rs/zeroize/latest/zeroize/) crate | | benchmarking | Enables the criterion benchmarking suite | --- @@ -164,9 +153,6 @@ There's a helper script to do this [here](./tests/KAT/build_kats.sh). ```bash # This example runs the basic tests for kyber768 cargo test - -# This runs the KATs for kyber512 in 90's mode -RUSTFLAGS='--cfg kyber_kat' cargo test --features "kyber512 90s" ``` See the [testing readme](./tests/readme.md) for more comprehensive info. diff --git a/src/avx2/aes256ctr.rs b/src/avx2/aes256ctr.rs deleted file mode 100644 index f5346df..0000000 --- a/src/avx2/aes256ctr.rs +++ /dev/null @@ -1,177 +0,0 @@ -// Based heavily on public-domain code by Romain Dolbeau -// Different handling of nonce+counter than original version using -// separated 64-bit nonce and internal 64-bit counter, starting from zero -// Public Domain -#![cfg(feature="90s")] - -use core::arch::x86_64::*; - -#[derive(Clone, Copy)] -#[repr(C)] -pub(crate) struct Aes256CtrCtx { - pub rkeys: [__m128i; 16], - pub n: __m128i -} - -impl Aes256CtrCtx { - pub fn new() -> Self { - unsafe { - Self { - rkeys: [_mm_setzero_si128(); 16], - n: _mm_setzero_si128() - } - } - } -} - -unsafe fn aesni_encrypt4(out: &mut[u8], n :&mut __m128i, rkeys: &[__m128i; 16]) -{ - let idx: __m128i = _mm_set_epi8(8,9,10,11,12,13,14,15,7,6,5,4,3,2,1,0); - - // Load current counter value - let mut f = _mm_load_si128(n); - - // Increase counter in 4 consecutive blocks - let mut f0 = _mm_shuffle_epi8(_mm_add_epi64(f,_mm_set_epi64x(0,0)),idx); - let mut f1 = _mm_shuffle_epi8(_mm_add_epi64(f,_mm_set_epi64x(1,0)),idx); - let mut f2 = _mm_shuffle_epi8(_mm_add_epi64(f,_mm_set_epi64x(2,0)),idx); - let mut f3 = _mm_shuffle_epi8(_mm_add_epi64(f,_mm_set_epi64x(3,0)),idx); - - // Write counter for next iteration, increased by 4 - _mm_store_si128(n as *mut __m128i,_mm_add_epi64(f,_mm_set_epi64x(4,0))); - - // Actual AES encryption, 4x interleaved4 - f = _mm_load_si128(&rkeys[0]); - f0 = _mm_xor_si128(f0,f); - f1 = _mm_xor_si128(f1,f); - f2 = _mm_xor_si128(f2,f); - f3 = _mm_xor_si128(f3,f); - - for i in 1..14 { - f = _mm_load_si128(&rkeys[i]); - f0 = _mm_aesenc_si128(f0,f); - f1 = _mm_aesenc_si128(f1,f); - f2 = _mm_aesenc_si128(f2,f); - f3 = _mm_aesenc_si128(f3,f); - } - - f = _mm_load_si128(&rkeys[14]); - f0 = _mm_aesenclast_si128(f0,f); - f1 = _mm_aesenclast_si128(f1,f); - f2 = _mm_aesenclast_si128(f2,f); - f3 = _mm_aesenclast_si128(f3,f); - - // Write results - _mm_storeu_si128(out[..].as_mut_ptr() as *mut __m128i, f0); - _mm_storeu_si128(out[16..].as_mut_ptr() as *mut __m128i, f1); - _mm_storeu_si128(out[32..].as_mut_ptr() as *mut __m128i, f2); - _mm_storeu_si128(out[48..].as_mut_ptr() as *mut __m128i, f3); -} - -// Casting aliases -unsafe fn cast_128i(x: __m128) -> __m128i -{ - _mm_castps_si128(x) -} - -unsafe fn cast_128(x: __m128i) -> __m128 -{ - _mm_castsi128_ps(x) -} - -pub(crate) fn aes256ctr_init(state: &mut Aes256CtrCtx, key: &[u8], nonce: [u8; 12]) -{ - unsafe { - let mut idx = 0; - let key0 = _mm_loadu_si128(key.as_ptr() as *const __m128i); - let key1 = _mm_loadu_si128(key[16..].as_ptr() as *const __m128i); - - state.n = _mm_loadl_epi64(nonce[..].as_ptr() as *const __m128i); - state.rkeys[idx] = key0; - idx += 1; - let mut temp0 = key0; - let mut temp1; - let mut temp2 = key1; - let mut temp4 = _mm_setzero_si128(); - - macro_rules! 
block1 { - ($imm:expr) => { - temp1 = _mm_aeskeygenassist_si128(temp2, $imm); - state.rkeys[idx] = temp2; - idx += 1; - temp4 = cast_128i(_mm_shuffle_ps(cast_128(temp4), cast_128(temp0), 0x10)); - temp0 = _mm_xor_si128(temp0, temp4); - temp4 = cast_128i(_mm_shuffle_ps(cast_128(temp4), cast_128(temp0), 0x8c)); - temp0 = _mm_xor_si128(temp0, temp4); - temp1 = cast_128i(_mm_shuffle_ps(cast_128(temp1), cast_128(temp1), 0xff)); - temp0 = _mm_xor_si128(temp0, temp1) - }; - } - - macro_rules! block2 { - ($imm:expr) => { - temp1 = _mm_aeskeygenassist_si128(temp0, $imm); - state.rkeys[idx] = temp0; - idx += 1; - temp4 = cast_128i(_mm_shuffle_ps(cast_128(temp4), cast_128(temp2), 0x10)); - temp2 = _mm_xor_si128(temp2, temp4); - temp4 = cast_128i(_mm_shuffle_ps(cast_128(temp4), cast_128(temp2), 0x8c)); - temp2 = _mm_xor_si128(temp2, temp4); - temp1 = cast_128i(_mm_shuffle_ps(cast_128(temp1), cast_128(temp1), 0xaa)); - temp2 = _mm_xor_si128(temp2, temp1) - }; - } - - block1!(0x01); - block2!(0x01); - block1!(0x02); - block2!(0x02); - - block1!(0x04); - block2!(0x04); - block1!(0x08); - block2!(0x08); - - block1!(0x10); - block2!(0x10); - block1!(0x20); - block2!(0x20); - - block1!(0x40); - state.rkeys[idx] = temp0; - } -} - -pub(crate) fn aes256ctr_squeezeblocks(out: &mut[u8], nblocks: usize, state: &mut Aes256CtrCtx) -{ - let mut idx = 0; - for _ in 0..nblocks { - unsafe { aesni_encrypt4(&mut out[idx..], &mut state.n, &state.rkeys); } - idx += 64 - } -} - -#[cfg(feature="90s")] -pub(crate) fn aes256ctr_prf(out: &mut[u8], mut outlen: usize, seed: &[u8], nonce: u8) -{ - let mut buf = [0u8; 64]; - let mut idx = 0; - let mut pad_nonce = [0u8; 12]; - let mut state = unsafe{ - Aes256CtrCtx{rkeys: [ _mm_setzero_si128(); 16], n: _mm_setzero_si128()} - }; - - pad_nonce[0] = nonce; - aes256ctr_init(&mut state, seed, pad_nonce); - - while outlen >= 64 { - unsafe { aesni_encrypt4(&mut out[idx..], &mut state.n, &state.rkeys); } - outlen -= 64; - idx += 64; - } - - if outlen != 0 { - unsafe { aesni_encrypt4(&mut buf, &mut state.n, &state.rkeys); } - out[idx..][..outlen].copy_from_slice(&buf[..outlen]); - } -} \ No newline at end of file diff --git a/src/avx2/align.rs b/src/avx2/align.rs deleted file mode 100644 index a3eeba2..0000000 --- a/src/avx2/align.rs +++ /dev/null @@ -1,110 +0,0 @@ -#![allow(dead_code)] - -use core::arch::x86_64::*; -use crate::params::*; -use crate::poly::NOISE_NBLOCKS; -use crate::fips202::{SHAKE128_RATE, SHAKE256_RATE}; -use crate::symmetric::*; -use crate::avx2::rejsample::REJ_UNIFORM_AVX_NBLOCKS; - -// Buffer unions -// #[derive(Copy, Clone)] -// #[repr(C, align(8))] -// pub(crate) union Align8 { -// pub coeffs: [u8; N], -// pub vec: [__m256i; V] -// } - -// impl Align8 { -// pub fn new() -> Self { -// Self { -// coeffs: [0u8; N] -// } -// } -// } - -#[derive(Copy, Clone)] -#[repr(C, align(32))] -pub union GenMatrixBuf { - pub coeffs: [u8; REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE], - pub vec: [__m256i; (REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE+31)/32] -} - -impl GenMatrixBuf { - pub fn new() -> Self { - Self { coeffs: [0u8; REJ_UNIFORM_AVX_NBLOCKS*SHAKE128_RATE]} - } -} - -#[cfg(feature="90s")] -#[repr(C)] -pub union GenMatrixBuf90s { - pub coeffs: - [u8; REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES], - pub vec: - [__m256i; (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES+31)/32] -} - -#[cfg(feature="90s")] -impl GenMatrixBuf90s { - pub fn new() -> Self { - Self { - coeffs: [0u8; REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES] - } - } - - #[cfg(debug_assertions)] - pub fn checksum(&self) -> i16 { - let mut out = 0; 
- for i in 0..REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES { - unsafe { out ^= self.coeffs[i] as i16; } - } - out - } -} - -#[repr(C)] -pub union IndcpaBuf { - pub coeffs: [u8; - (KYBER_ETA1*KYBER_N/4) - /XOF_BLOCKBYTES*XOF_BLOCKBYTES+32], - pub vec: [__m256i; - ((KYBER_ETA1*KYBER_N/4) - /XOF_BLOCKBYTES*XOF_BLOCKBYTES+32+31)/32] -} - -impl IndcpaBuf { - pub fn new() -> Self { - Self { - coeffs: [0u8; - (KYBER_ETA1*KYBER_N/4) - /XOF_BLOCKBYTES*XOF_BLOCKBYTES+32] - } - } -} - -#[repr(C, align(8))] -pub union Eta2Buf { - pub coeffs: [u8; KYBER_ETA2*KYBER_N/4], - pub vec: [__m256i; (KYBER_ETA2*KYBER_N/4+31)/32] -} - -impl Eta2Buf { - pub fn new() -> Self { - Self { coeffs: [0u8; KYBER_ETA2*KYBER_N/4] } - } -} - -#[derive(Copy, Clone)] -#[repr(C, align(8))] -pub union Eta4xBuf { - pub coeffs: [u8; NOISE_NBLOCKS*SHAKE256_RATE], - pub vec: [__m256i; (NOISE_NBLOCKS*SHAKE256_RATE+31)/32] -} - -impl Eta4xBuf { - pub fn new() -> Self { - Self { coeffs: [0u8; NOISE_NBLOCKS*SHAKE256_RATE] } - } -} - diff --git a/src/avx2/basemul.S b/src/avx2/basemul.S deleted file mode 100644 index c16da49..0000000 --- a/src/avx2/basemul.S +++ /dev/null @@ -1,107 +0,0 @@ -#include "consts.h" - -.macro schoolbook off -vmovdqa _16XQINV*2(%rcx),%ymm0 -vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0 -vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0 -vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1 -vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1 - -vpmullw %ymm0,%ymm1,%ymm9 # a0.lo -vpmullw %ymm0,%ymm2,%ymm10 # b0.lo -vpmullw %ymm0,%ymm3,%ymm11 # a1.lo -vpmullw %ymm0,%ymm4,%ymm12 # b1.lo - -vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0 -vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0 - -vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi -vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi -vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi -vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi - -vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1 -vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1 - -vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi -vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi -vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi -vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi - -vmovdqa %ymm13,(%rsp) - -vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo -vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo -vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo -vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo - -vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo -vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo -vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo -vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo - -vmovdqa _16XQ*2(%rcx),%ymm8 -vpmulhw %ymm8,%ymm13,%ymm13 -vpmulhw %ymm8,%ymm9,%ymm9 -vpmulhw %ymm8,%ymm5,%ymm5 -vpmulhw %ymm8,%ymm10,%ymm10 -vpmulhw %ymm8,%ymm6,%ymm6 -vpmulhw %ymm8,%ymm11,%ymm11 -vpmulhw %ymm8,%ymm7,%ymm7 -vpmulhw %ymm8,%ymm12,%ymm12 - -vpsubw (%rsp),%ymm13,%ymm13 # -a0c0 -vpsubw %ymm9,%ymm1,%ymm9 # a0d0 -vpsubw %ymm5,%ymm14,%ymm5 # b0c0 -vpsubw %ymm10,%ymm2,%ymm10 # b0d0 - -vpsubw %ymm6,%ymm15,%ymm6 # a1c1 -vpsubw %ymm11,%ymm3,%ymm11 # a1d1 -vpsubw %ymm7,%ymm0,%ymm7 # b1c1 -vpsubw %ymm12,%ymm4,%ymm12 # b1d1 - -vmovdqa (%r9),%ymm0 -vmovdqa 32(%r9),%ymm1 -vpmullw %ymm0,%ymm10,%ymm2 -vpmullw %ymm0,%ymm12,%ymm3 -vpmulhw %ymm1,%ymm10,%ymm10 -vpmulhw %ymm1,%ymm12,%ymm12 -vpmulhw %ymm8,%ymm2,%ymm2 -vpmulhw %ymm8,%ymm3,%ymm3 -vpsubw %ymm2,%ymm10,%ymm10 # rb0d0 -vpsubw %ymm3,%ymm12,%ymm12 # rb1d1 - -vpaddw %ymm5,%ymm9,%ymm9 -vpaddw %ymm7,%ymm11,%ymm11 -vpsubw %ymm13,%ymm10,%ymm13 -vpsubw %ymm12,%ymm6,%ymm6 - -vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) -vmovdqa %ymm9,(64*\off+16)*2(%rdi) -vmovdqa %ymm6,(64*\off+32)*2(%rdi) -vmovdqa %ymm11,(64*\off+48)*2(%rdi) -.endm - -.text -.global basemul_avx -.global _basemul_avx -basemul_avx: -_basemul_avx: -mov %rsp,%r8 -and $-32,%rsp -sub 
$32,%rsp - -lea (_ZETAS_EXP+176)*2(%rcx),%r9 -schoolbook 0 - -add $32*2,%r9 -schoolbook 1 - -add $192*2,%r9 -schoolbook 2 - -add $32*2,%r9 -schoolbook 3 - -mov %r8,%rsp -ret diff --git a/src/avx2/cbd.rs b/src/avx2/cbd.rs deleted file mode 100644 index 4bbc237..0000000 --- a/src/avx2/cbd.rs +++ /dev/null @@ -1,136 +0,0 @@ - -#![allow(non_snake_case, dead_code)] -use core::arch::x86_64::*; -use crate::params::KYBER_N; -use crate::poly::*; -use crate::align::Eta4xBuf; -#[cfg(feature="90s")] -use crate::align::IndcpaBuf; - -fn cbd2(r: &mut Poly, buf: &[__m256i]) { - unsafe { - let mask55: __m256i = _mm256_set1_epi32(0x55555555); - let mask33: __m256i = _mm256_set1_epi32(0x33333333); - let mask03: __m256i = _mm256_set1_epi32(0x03030303); - let mask0F: __m256i = _mm256_set1_epi32(0x0F0F0F0F); - let (mut f0, mut f1, mut f2, mut f3); - for i in 0..(KYBER_N/64) { - f0 = _mm256_load_si256(&buf[i]); - - f1 = _mm256_srli_epi16(f0, 1); - f0 = _mm256_and_si256(mask55, f0); - f1 = _mm256_and_si256(mask55, f1); - f0 = _mm256_add_epi8(f0, f1); - - f1 = _mm256_srli_epi16(f0, 2); - f0 = _mm256_and_si256(mask33, f0); - f1 = _mm256_and_si256(mask33, f1); - f0 = _mm256_add_epi8(f0, mask33); - f0 = _mm256_sub_epi8(f0, f1); - - f1 = _mm256_srli_epi16(f0, 4); - f0 = _mm256_and_si256(mask0F, f0); - f1 = _mm256_and_si256(mask0F, f1); - f0 = _mm256_sub_epi8(f0, mask03); - f1 = _mm256_sub_epi8(f1, mask03); - - f2 = _mm256_unpacklo_epi8(f0, f1); - f3 = _mm256_unpackhi_epi8(f0, f1); - - f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2)); - f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2,1)); - f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3)); - f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3,1)); - - _mm256_store_si256(&mut r.vec[4*i+0], f0); - _mm256_store_si256(&mut r.vec[4*i+1], f2); - _mm256_store_si256(&mut r.vec[4*i+2], f1); - _mm256_store_si256(&mut r.vec[4*i+3], f3); - } - } -} - -fn cbd3(r: &mut Poly, buf: &[u8]) { - unsafe { - let (mut f0, mut f1, mut f2, mut f3); - let mask249: __m256i = _mm256_set1_epi32(0x249249); - let mask6DB: __m256i = _mm256_set1_epi32(0x6DB6DB); - let mask07: __m256i = _mm256_set1_epi32(7); - let mask70: __m256i = _mm256_set1_epi32(7 << 16); - let mask: __m256i = _mm256_set1_epi16(3); - let shufbidx: __m256i = _mm256_set_epi8( - -1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4, - -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0 - ); - - for i in 0..(KYBER_N/32) { - f0 = _mm256_loadu_si256(buf[24*i..].as_ptr() as *const __m256i); - f0 = _mm256_permute4x64_epi64(f0,0x94); - f0 = _mm256_shuffle_epi8(f0,shufbidx); - - f1 = _mm256_srli_epi32(f0,1); - f2 = _mm256_srli_epi32(f0,2); - f0 = _mm256_and_si256(mask249,f0); - f1 = _mm256_and_si256(mask249,f1); - f2 = _mm256_and_si256(mask249,f2); - f0 = _mm256_add_epi32(f0,f1); - f0 = _mm256_add_epi32(f0,f2); - - f1 = _mm256_srli_epi32(f0,3); - f0 = _mm256_add_epi32(f0,mask6DB); - f0 = _mm256_sub_epi32(f0,f1); - - f1 = _mm256_slli_epi32(f0,10); - f2 = _mm256_srli_epi32(f0,12); - f3 = _mm256_srli_epi32(f0, 2); - f0 = _mm256_and_si256(f0,mask07); - f1 = _mm256_and_si256(f1,mask70); - f2 = _mm256_and_si256(f2,mask07); - f3 = _mm256_and_si256(f3,mask70); - f0 = _mm256_add_epi16(f0,f1); - f1 = _mm256_add_epi16(f2,f3); - f0 = _mm256_sub_epi16(f0,mask); - f1 = _mm256_sub_epi16(f1,mask); - - f2 = _mm256_unpacklo_epi32(f0,f1); - f3 = _mm256_unpackhi_epi32(f0,f1); - - f0 = _mm256_permute2x128_si256(f2,f3,0x20); - f1 = _mm256_permute2x128_si256(f2,f3,0x31); - - _mm256_store_si256(&mut r.vec[2*i+0], f0); - _mm256_store_si256(&mut 
r.vec[2*i+1], f1); - } - } -} - -pub fn poly_cbd_eta1(r: &mut Poly, buf: &Eta4xBuf) -{ - unsafe { - if cfg!(feature="kyber512") { - cbd3(r, &buf.coeffs) - } - else { - cbd2(r, &buf.vec) - } - } -} - -#[cfg(feature="90s")] -pub fn poly_cbd_eta1_90s(r: &mut Poly, buf: &IndcpaBuf) -{ - unsafe { - if cfg!(feature="kyber512") { - cbd3(r, &buf.coeffs) - } - else { - cbd2(r, &buf.vec) - } - } -} - - -pub fn poly_cbd_eta2(r: &mut Poly, buf: &[__m256i]) -{ - cbd2(r, &buf) -} \ No newline at end of file diff --git a/src/avx2/consts.h b/src/avx2/consts.h deleted file mode 100644 index 966e150..0000000 --- a/src/avx2/consts.h +++ /dev/null @@ -1,38 +0,0 @@ -#define _16XQ 0 -#define _16XQINV 16 -#define _16XV 32 -#define _16XFLO 48 -#define _16XFHI 64 -#define _16XMONTSQLO 80 -#define _16XMONTSQHI 96 -#define _16XMASK 112 -#define _REVIDXB 128 -#define _REVIDXD 144 -#define _ZETAS_EXP 160 -#define _16XSHIFT 624 - -/* The C ABI on MacOS exports all symbols with a leading - * underscore. This means that any symbols we refer to from - * C files (functions) can't be found, and all symbols we - * refer to from ASM also can't be found. - * - * This define helps us get around this - */ -// #ifdef __ASSEMBLER__ -// #if defined(__WIN32__) || defined(__APPLE__) -// #define decorate(s) _##s -// #define cdecl2(s) decorate(s) -// #define cdecl(s) cdecl2(KYBER_NAMESPACE(##s)) -// #else -// #define cdecl(s) KYBER_NAMESPACE(##s) -// #endif -// #endif - -// #ifndef __ASSEMBLER__ -// #include "align.h" -// typedef ALIGNED_INT16(640) qdata_t; -// #define qdata KYBER_NAMESPACE(qdata) -// extern const qdata_t qdata; -// #endif - -// #endif diff --git a/src/avx2/consts.rs b/src/avx2/consts.rs deleted file mode 100644 index b1b22b3..0000000 --- a/src/avx2/consts.rs +++ /dev/null @@ -1,116 +0,0 @@ -use core::arch::x86_64::*; -use crate::params::KYBER_Q; - -pub(crate) const Q: i16 = KYBER_Q as i16; -// pub(crate) const MONT: i16 = -1044; // 2^16 mod q -pub(crate) const QINV: i16 = -3327; // q^-1 mod 2^16 -pub(crate) const V: i16 = 20159; // floor(2^26/q + 0.5) -pub(crate) const FHI: i16 = 1441; // mont^2/128 -pub(crate) const FLO: i16 = -10079; // qinv*FHI -pub(crate) const MONTSQHI: i16 = 1353; // mont^2 -pub(crate) const MONTSQLO: i16 = 20553; // qinv*MONTSQHI -pub(crate) const MASK: i16 = 4095; -pub(crate) const SHIFT: i16 = 32; - -pub(crate) const _16XQ: usize = 0; -pub(crate) const _16XQINV: usize = 16; -pub(crate) const _16XV: usize = 32; -pub(crate) const _16XFLO: usize = 48; -pub(crate) const _16XFHI: usize = 64; -pub(crate) const _16XMONTSQLO: usize = 80; -pub(crate) const _16XMONTSQHI: usize = 96; -pub(crate) const _16XMASK: usize = 112; -pub(crate) const _REVIDXB: usize = 128; -pub(crate) const _REVIDXD: usize = 144; -pub(crate) const _ZETAS_EXP: usize = 160; -pub(crate) const _16XSHIFT: usize = 624; - -#[repr(C, align(32))] -pub union Qdata { - pub coeffs: [i16; 640], - pub vec: [__m256i; 40] -} - -pub const QDATA: Qdata = Qdata { coeffs: -[ Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, - QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, - QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, - V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, - FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, - FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, - FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, - FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, - MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, - MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, - MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, - MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, - MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, - 
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, - MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, - MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, - MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, - MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, - 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, - 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, - 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0, - 31498, 31498, 31498, 31498, -758, -758, -758, -758, - 5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397, - 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, - 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, - -359, -359, -359, -359, -359, -359, -359, -359, - -359, -359, -359, -359, -359, -359, -359, -359, - 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, --12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, --20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758, - -3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690, - -171, -171, -171, -171, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 182, 182, 182, 182, - -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, - 5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242, - 573, 573, -1325, -1325, 264, 264, 383, 383, - -829, -829, 1458, 1458, -1602, -1602, -130, -130, - -5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080, --12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, - 1223, 652, -552, 1015, -1293, 1491, -282, -1544, - 516, -8, -320, -666, -1618, -1162, 126, 1469, - -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, - 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, - -1103, 555, -1251, 1550, 422, 177, -291, 1574, - -246, 1159, -777, -602, -1590, -872, 418, -156, - 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, --32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619, - 430, 843, 871, 105, 587, -235, -460, 1653, - 778, -147, 1483, 1119, 644, 349, 329, -75, - 787, 787, 787, 787, 787, 787, 787, 787, - 787, 787, 787, 787, 787, 787, 787, 787, - -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, --16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, - 287, 287, 287, 287, 287, 287, 287, 287, - 202, 202, 202, 202, 202, 202, 202, 202, - 10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358, --11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164, - 962, 962, 962, 962, -1202, -1202, -1202, -1202, - -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, --28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800, - 18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163, - -681, -681, 1017, 1017, 732, 732, 608, 608, - -1542, -1542, 411, 411, -205, -205, -1571, -1571, - 19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249, - 13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915, - -853, -90, -271, 830, 107, -1421, -247, -951, - -398, 961, -1508, -725, 448, -1065, 677, -1275, --31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, - 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, - 817, 603, 1322, -1465, -1215, 1218, -874, -1187, - -1185, -1278, -1510, -870, -108, 996, 958, 1522, - 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469, --21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, - 1097, 610, -1285, 384, -136, -1335, 220, -1659, - -1530, 794, -854, 478, -308, 991, -1460, 1628, - SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, - 
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT ] -}; diff --git a/src/avx2/fips202.rs b/src/avx2/fips202.rs deleted file mode 100644 index 8c5bc3c..0000000 --- a/src/avx2/fips202.rs +++ /dev/null @@ -1,597 +0,0 @@ -#![allow(clippy::needless_range_loop, dead_code)] -use crate::symmetric::KeccakState; - -pub(crate) const SHAKE128_RATE: usize = 168; -pub(crate) const SHAKE256_RATE: usize = 136; -const SHA3_256_RATE: usize = 136; -const SHA3_512_RATE: usize = 72; -const NROUNDS: usize = 24; - -fn rol(a: u64, offset: u64) -> u64 -{ - (a << offset) ^ (a >> (64-offset)) -} - -// Name: load64 -// -// Description: Load 8 bytes into u64 in little-endian order -// -// Arguments: - const [u8] x: input byte array -// -// Returns the loaded 64-bit unsigned integer -pub fn load64(x: &[u8]) -> u64 -{ - let mut r = 0u64; - for i in 0..8 { - r |= (x[i] as u64) << (8 * i); - } - r -} - -// Name: store64 -// -// Description: Store a 64-bit integer to a byte array in little-endian order -// -// Arguments: - [u8] x: the output byte array -// - u64 u: input 64-bit unsigned integer -pub fn store64(x: &mut[u8], mut u: u64) -{ - for i in x.iter_mut().take(8) { - *i = u as u8; - u >>= 8; - } -} - -// Keccak round constants -const KECCAKF_ROUNDCONSTANTS: [u64; NROUNDS] = [ - 0x0000000000000001, - 0x0000000000008082, - 0x800000000000808a, - 0x8000000080008000, - 0x000000000000808b, - 0x0000000080000001, - 0x8000000080008081, - 0x8000000000008009, - 0x000000000000008a, - 0x0000000000000088, - 0x0000000080008009, - 0x000000008000000a, - 0x000000008000808b, - 0x800000000000008b, - 0x8000000000008089, - 0x8000000000008003, - 0x8000000000008002, - 0x8000000000000080, - 0x000000000000800a, - 0x800000008000000a, - 0x8000000080008081, - 0x8000000000008080, - 0x0000000080000001, - 0x8000000080008008 -]; - -// Name: KeccakF1600_StatePermute -// -// Description: The Keccak F1600 Permutation -// -// Arguments: - u64 * state: in/output Keccak state -pub(crate) fn keccakf1600_statepermute(state: &mut[u64]) -{ - //copyFromState(A, state) - let mut aba = state[ 0]; - let mut abe = state[ 1]; - let mut abi = state[ 2]; - let mut abo = state[ 3]; - let mut abu = state[ 4]; - let mut aga = state[ 5]; - let mut age = state[ 6]; - let mut agi = state[ 7]; - let mut ago = state[ 8]; - let mut agu = state[ 9]; - let mut aka = state[10]; - let mut ake = state[11]; - let mut aki = state[12]; - let mut ako = state[13]; - let mut aku = state[14]; - let mut ama = state[15]; - let mut ame = state[16]; - let mut ami = state[17]; - let mut amo = state[18]; - let mut amu = state[19]; - let mut asa = state[20]; - let mut ase = state[21]; - let mut asi = state[22]; - let mut aso = state[23]; - let mut asu = state[24]; - - for round in (0..NROUNDS).step_by(2) { - // prepareTheta - let mut bca = aba^aga^aka^ama^asa; - let mut bce = abe^age^ake^ame^ase; - let mut bci = abi^agi^aki^ami^asi; - let mut bco = abo^ago^ako^amo^aso; - let mut bcu = abu^agu^aku^amu^asu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - let mut da = bcu^rol(bce, 1); - let mut de = bca^rol(bci, 1); - let mut di = bce^rol(bco, 1); - let mut d_o = bci^rol(bcu, 1); - let mut du = bco^rol(bca, 1); - - aba ^= da; - bca = aba; - age ^= de; - bce = rol(age, 44); - aki ^= di; - bci = rol(aki, 43); - amo ^= d_o; - bco = rol(amo, 21); - asu ^= du; - bcu = rol(asu, 14); - let mut eba = bca ^((!bce)& bci ); - eba ^= KECCAKF_ROUNDCONSTANTS[round]; - let mut ebe = bce ^((!bci)& bco ); - let mut ebi = bci ^((!bco)& bcu ); - let mut ebo = bco ^((!bcu)& bca ); - let mut ebu = bcu 
^((!bca)& bce ); - - abo ^= d_o; - bca = rol(abo, 28); - agu ^= du; - bce = rol(agu, 20); - aka ^= da; - bci = rol(aka, 3); - ame ^= de; - bco = rol(ame, 45); - asi ^= di; - bcu = rol(asi, 61); - let mut ega = bca ^((!bce)& bci ); - let mut ege = bce ^((!bci)& bco ); - let mut egi = bci ^((!bco)& bcu ); - let mut ego = bco ^((!bcu)& bca ); - let mut egu = bcu ^((!bca)& bce ); - - abe ^= de; - bca = rol(abe, 1); - agi ^= di; - bce = rol(agi, 6); - ako ^= d_o; - bci = rol(ako, 25); - amu ^= du; - bco = rol(amu, 8); - asa ^= da; - bcu = rol(asa, 18); - let mut eka = bca ^((!bce)& bci ); - let mut eke = bce ^((!bci)& bco ); - let mut eki = bci ^((!bco)& bcu ); - let mut eko = bco ^((!bcu)& bca ); - let mut eku = bcu ^((!bca)& bce ); - - abu ^= du; - bca = rol(abu, 27); - aga ^= da; - bce = rol(aga, 36); - ake ^= de; - bci = rol(ake, 10); - ami ^= di; - bco = rol(ami, 15); - aso ^= d_o; - bcu = rol(aso, 56); - let mut ema = bca ^((!bce)& bci ); - let mut eme = bce ^((!bci)& bco ); - let mut emi = bci ^((!bco)& bcu ); - let mut emo = bco ^((!bcu)& bca ); - let mut emu = bcu ^((!bca)& bce ); - - abi ^= di; - bca = rol(abi, 62); - ago ^= d_o; - bce = rol(ago, 55); - aku ^= du; - bci = rol(aku, 39); - ama ^= da; - bco = rol(ama, 41); - ase ^= de; - bcu = rol(ase, 2); - let mut esa = bca ^((!bce)& bci ); - let mut ese = bce ^((!bci)& bco ); - let mut esi = bci ^((!bco)& bcu ); - let mut eso = bco ^((!bcu)& bca ); - let mut esu = bcu ^((!bca)& bce ); - - // prepareTheta - bca = eba^ega^eka^ema^esa; - bce = ebe^ege^eke^eme^ese; - bci = ebi^egi^eki^emi^esi; - bco = ebo^ego^eko^emo^eso; - bcu = ebu^egu^eku^emu^esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - da = bcu^rol(bce, 1); - de = bca^rol(bci, 1); - di = bce^rol(bco, 1); - d_o = bci^rol(bcu, 1); - du = bco^rol(bca, 1); - - eba ^= da; - bca = eba; - ege ^= de; - bce = rol(ege, 44); - eki ^= di; - bci = rol(eki, 43); - emo ^= d_o; - bco = rol(emo, 21); - esu ^= du; - bcu = rol(esu, 14); - aba = bca ^((!bce)& bci ); - aba ^= KECCAKF_ROUNDCONSTANTS[round+1]; - abe = bce ^((!bci)& bco ); - abi = bci ^((!bco)& bcu ); - abo = bco ^((!bcu)& bca ); - abu = bcu ^((!bca)& bce ); - - ebo ^= d_o; - bca = rol(ebo, 28); - egu ^= du; - bce = rol(egu, 20); - eka ^= da; - bci = rol(eka, 3); - eme ^= de; - bco = rol(eme, 45); - esi ^= di; - bcu = rol(esi, 61); - aga = bca ^((!bce)& bci ); - age = bce ^((!bci)& bco ); - agi = bci ^((!bco)& bcu ); - ago = bco ^((!bcu)& bca ); - agu = bcu ^((!bca)& bce ); - - ebe ^= de; - bca = rol(ebe, 1); - egi ^= di; - bce = rol(egi, 6); - eko ^= d_o; - bci = rol(eko, 25); - emu ^= du; - bco = rol(emu, 8); - esa ^= da; - bcu = rol(esa, 18); - aka = bca ^((!bce)& bci ); - ake = bce ^((!bci)& bco ); - aki = bci ^((!bco)& bcu ); - ako = bco ^((!bcu)& bca ); - aku = bcu ^((!bca)& bce ); - - ebu ^= du; - bca = rol(ebu, 27); - ega ^= da; - bce = rol(ega, 36); - eke ^= de; - bci = rol(eke, 10); - emi ^= di; - bco = rol(emi, 15); - eso ^= d_o; - bcu = rol(eso, 56); - ama = bca ^((!bce)& bci ); - ame = bce ^((!bci)& bco ); - ami = bci ^((!bco)& bcu ); - amo = bco ^((!bcu)& bca ); - amu = bcu ^((!bca)& bce ); - - ebi ^= di; - bca = rol(ebi, 62); - ego ^= d_o; - bce = rol(ego, 55); - eku ^= du; - bci = rol(eku, 39); - ema ^= da; - bco = rol(ema, 41); - ese ^= de; - bcu = rol(ese, 2); - asa = bca ^((!bce)& bci ); - ase = bce ^((!bci)& bco ); - asi = bci ^((!bco)& bcu ); - aso = bco ^((!bcu)& bca ); - asu = bcu ^((!bca)& bce ); - } - - state[ 0] = aba; - state[ 1] = abe; - state[ 2] = abi; - state[ 3] = abo; - state[ 4] = abu; - state[ 
5] = aga; - state[ 6] = age; - state[ 7] = agi; - state[ 8] = ago; - state[ 9] = agu; - state[10] = aka; - state[11] = ake; - state[12] = aki; - state[13] = ako; - state[14] = aku; - state[15] = ama; - state[16] = ame; - state[17] = ami; - state[18] = amo; - state[19] = amu; - state[20] = asa; - state[21] = ase; - state[22] = asi; - state[23] = aso; - state[24] = asu; -} - -// Name: keccak_squeezeblocks -// -// Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -// Modifies the state. Can be called multiple times to keep squeezing, -// i.e., is incremental. -// -// Arguments: - [u8] h: output blocks -// - u64 nblocks: number of blocks to be squeezed (written to h) -// - u64 *s: in/output Keccak state -// - usize r: rate in bytes (e.g., 168 for SHAKE128) -pub(crate) fn keccak_squeezeblocks(h: &mut[u8], mut nblocks: usize, s: &mut [u64], r: usize) -{ - let mut idx = 0usize; - while nblocks > 0 { - keccakf1600_statepermute(s); - for i in 0..r/8 { - store64(&mut h[idx+8*i..], s[i]) - } - idx += r; - nblocks -= 1; - } -} - -// Name: shake128_squeezeblocks -// -// Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of -// SHAKE128_RATE bytes each. Can be called multiple times -// to keep squeezing. Assumes new block has not yet been -// started (state->pos = SHAKE128_RATE). -// -// Arguments: - [u8] out: pointer to output blocks -// - u64 nblocks: number of blocks to be squeezed (written to output) -// - KeccakState state: pointer to input/output Keccak state -pub(crate) fn shake128_squeezeblocks(out: &mut[u8], nblocks: usize, state: &mut KeccakState) -{ - keccak_squeezeblocks(out, nblocks, &mut state.s, SHAKE128_RATE); -} - -// Name: shake256 -// -// Description: SHAKE256 XOF with non-incremental API -// -// Arguments: - [u8] output: output -// - usize outlen: requested output length in bytes -// - [u8] input: input -// - usize inlen: length of input in bytes -pub(crate) fn shake256(out: &mut[u8], mut outlen: usize, input: &[u8], inlen: usize) -{ - let mut state = KeccakState::new(); - let mut idx = 0; - shake256_absorb_once(&mut state, input, inlen); - let nblocks = outlen/SHAKE256_RATE; - shake256_squeezeblocks(&mut out[idx..], nblocks, &mut state); - outlen -= nblocks*SHAKE256_RATE; - idx += nblocks*SHAKE256_RATE; - shake256_squeeze(&mut out[idx..], outlen, &mut state); -} - -// Name: sha3_256 -// -// Description: SHA3-256 with non-incremental API -// -// Arguments: - [u8] h: output (32 bytes) -// - const [u8] input: input -// - usize inlen: length of input in bytes -pub(crate) fn sha3_256(h: &mut[u8], input: &[u8], inlen: usize) -{ - let mut s = [0u64; 25]; - keccak_absorb_once(&mut s, SHA3_256_RATE, input, inlen, 0x06); - keccakf1600_statepermute(&mut s); - for i in 0..4 { - store64(&mut h[8*i..], s[i]); - } -} - -// Name: sha3_512 -// -// Description: SHA3-512 with non-incremental API -// -// Arguments: - [u8] h: output (64 bytes) -// - const [u8] input: input -// - usize inlen: length of input in bytes -pub(crate) fn sha3_512(h: &mut[u8], input: &[u8], inlen: usize) -{ - let mut s = [0u64; 25]; - keccak_absorb_once(&mut s, SHA3_512_RATE, input, inlen, 0x06); - keccakf1600_statepermute(&mut s); - for i in 0..8 { - store64(&mut h[8*i..], s[i]); - } -} - - - -// Name: keccak_finalize -// -// Description: Finalize absorb step. 
-// -// Arguments: - u64 s: pointer to Keccak state -// - usize pos: position in current block to be absorbed -// - usize r: rate in bytes (e.g., 168 for SHAKE128) -// - u8 p: domain separation byte -fn keccak_finalize(s: &mut[u64], pos: usize, r: usize, p: u8) -{ - s[pos/8] ^= (p as u64) << 8*(pos%8); - s[r/8-1] ^= 1u64 << 63; -} - -// Name: keccak_absorb_once -// -// Description: Absorb step of Keccak; -// non-incremental, starts by zeroing the state. -// -// Arguments: - u64 *s: (uninitialized) output Keccak state -// - usize r: rate in bytes (e.g., 168 for SHAKE128) -// - const [u8] input: input to be absorbed into s -// - u64 mlen: length of input in bytes -// - [u8] p: domain-separation byte for different Keccak-derived functions -pub(crate) fn keccak_absorb_once( - s: &mut[u64], - r: usize, - input: &[u8], - mut inlen: - usize, - p: u8) -{ - // Zero State - for i in s.iter_mut() { - *i = 0; - } - - let mut idx = 0usize; - while inlen >= r { - for i in 0..(r/8) { - s[i] ^= load64(&input[idx+8*i..]); - } - idx += r; - inlen -= r; - keccakf1600_statepermute(s); - } - - for i in 0..inlen { - s[i/8] ^= (input[idx+i] as u64) << 8*(i%8); - } - s[inlen/8] ^= (p as u64) << 8*(inlen%8); - s[(r-1)/8] ^= 1u64 << 63; -} - -// Name: keccak_squeeze -// -// Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. -// Modifies the state. Can be called multiple times to keep squeezing, -// i.e., is incremental. -// -// Arguments: - [u8] out: output blocks -// - u64 nblocks: number of blocks to be squeezed (written to out) -// - u64 *s: in/output Keccak state -// usize pos: number of bytes in current block already squeezed -// - usize r: rate in bytes (e.g., 168 for SHAKE128) -// Returns new position pos in current block -pub(crate) fn keccak_squeeze( - out: &mut[u8], - mut outlen: usize, - s: &mut [u64], - mut pos: usize, - r: usize -) -> usize -{ - let mut idx = 0; - while outlen > 0 { - if pos == r { - keccakf1600_statepermute(s); - pos = 0 - } - let mut i = pos; - while i < r && i < pos+outlen { - out[idx] = (s[i/8] >> 8*(i%8)) as u8; - i += 1; - idx += 1; - } - outlen -= i-pos; - pos = i; - } - pos -} - -// Name: shake128_init -// -// Description: Initializes Keccak state for use as SHAKE128 XOF -// -// Arguments: - keccak_state state: (uninitialized) Keccak state -fn shake128_init(state: &mut KeccakState) -{ - state.reset() -} - - -// Name: shake128_finalize -// -// Description: Finalize absorb step of the SHAKE128 XOF. -// -// Arguments: - keccak_state state: pointer to Keccak state -fn shake128_finalize(state: &mut KeccakState) -{ - keccak_finalize(&mut state.s, state.pos, SHAKE128_RATE, 0x1F); - state.pos = SHAKE128_RATE; -} - -// Name: shake128_squeeze -// -// Description: Squeeze step of SHAKE128 XOF. Squeezes arbitrarily many -// bytes. Can be called multiple times to keep squeezing. -// -// Arguments: - [u8] out: pointer to output blocks -// - usize outlen : number of bytes to be squeezed (written to output) -// - keccak_state s: pointer to input/output Keccak state -fn shake128_squeeze(out: &mut[u8], outlen: usize, state: &mut KeccakState) -{ - state.pos = keccak_squeeze(out, outlen, &mut state.s, state.pos, SHAKE128_RATE); -} - -// Name: shake128_absorb_once -// -// Description: Initialize, absorb into and finalize SHAKE128 XOF; non-incremental. 
-// -// Arguments: - keccak_state state: pointer to (uninitialized) output Keccak state -// - const [u8] in: input to be absorbed into s -// - usize inlen: length of input in bytes -pub(crate) fn shake128_absorb_once(state: &mut KeccakState, input: &[u8], inlen: usize) -{ - keccak_absorb_once(&mut state.s, SHAKE128_RATE, input, inlen, 0x1F); - state.pos = SHAKE128_RATE; -} - -fn shake256_init(state: &mut KeccakState) { - state.reset(); -} - -fn shake256_finalize(state: &mut KeccakState) -{ - keccak_finalize(&mut state.s, state.pos, SHAKE256_RATE, 0x1F); - state.pos = SHAKE256_RATE; -} - -fn shake256_squeeze(out: &mut[u8], outlen: usize, state: &mut KeccakState) -{ - state.pos = keccak_squeeze(out, outlen, &mut state.s, state.pos, SHAKE256_RATE); -} - -pub(crate) fn shake256_absorb_once(state: &mut KeccakState, input: &[u8], inlen: usize) -{ - keccak_absorb_once(&mut state.s, SHAKE256_RATE, input, inlen, 0x1F); - state.pos = SHAKE256_RATE; -} - -fn shake256_squeezeblocks(out: &mut[u8], nblocks: usize, state: &mut KeccakState) -{ - keccak_squeezeblocks(out, nblocks, &mut state.s, SHAKE256_RATE); -} - -fn shake128(out: &mut[u8], mut outlen: usize, input: &[u8], inlen: usize) -{ - let mut state = KeccakState::new(); - let mut idx = 0; - shake128_absorb_once(&mut state, input, inlen); - let nblocks = outlen/SHAKE128_RATE; - shake128_squeezeblocks(&mut out[idx..], nblocks, &mut state); - outlen -= nblocks*SHAKE128_RATE; - idx += nblocks*SHAKE128_RATE; - shake128_squeeze(&mut out[idx..], outlen, &mut state); -} - diff --git a/src/avx2/fips202x4.rs b/src/avx2/fips202x4.rs deleted file mode 100644 index 516d765..0000000 --- a/src/avx2/fips202x4.rs +++ /dev/null @@ -1,280 +0,0 @@ -#![allow(dead_code)] - -use core::arch::x86_64::*; -use crate::fips202::*; -use crate::keccak4x::f1600_x4; -use crate::align::{GenMatrixBuf, Eta4xBuf}; - -#[repr(C)] -pub struct Keccakx4State { - s: [__m256i; 25] -} - -impl Keccakx4State { - pub fn new() -> Self { - unsafe {Keccakx4State { s: [_mm256_setzero_si256(); 25]}} - } -} - -pub unsafe fn keccakx4_absorb_once( - s: &mut[__m256i; 25], - r: usize, - in0: &[u8], - in1: &[u8], - in2: &[u8], - in3: &[u8], - mut inlen: usize, - p: u8 -) -{ - let mut pos = 0i64; - let mut t; - for i in 0..25 { - s[i] = _mm256_setzero_si256(); - } - let mut idx = _mm256_set_epi64x( - in3.as_ptr() as i64, - in2.as_ptr() as i64, - in1.as_ptr() as i64, - in0.as_ptr() as i64, - ); - while inlen >= r { - for i in 0..(r/8) { - t = _mm256_i64gather_epi64(pos as *const i64, idx, 1); - s[i] = _mm256_xor_si256(s[i], t); - pos += 8; - } - inlen -= r; - f1600_x4(s); - } - let end = inlen/8; - for i in 0..end { - t = _mm256_i64gather_epi64(pos as *const i64, idx, 1); - s[i] = _mm256_xor_si256(s[i], t); - pos += 8; - } - inlen -= 8*end; - - if inlen > 0 { - t = _mm256_i64gather_epi64(pos as *const i64, idx, 1); - idx = _mm256_set1_epi64x(((1u64 << (8*inlen)) - 1) as i64); - t = _mm256_and_si256(t, idx); - s[end] = _mm256_xor_si256(s[end], t); - } - - t = _mm256_set1_epi64x(((p as u64) << 8*inlen) as i64); - s[end] = _mm256_xor_si256(s[end], t); - t = _mm256_set1_epi64x((1u64 << 63) as i64); - s[r/8 - 1] = _mm256_xor_si256(s[r/8 - 1], t); -} - -pub unsafe fn keccakx4_squeezeblocks128( - out: &mut [GenMatrixBuf; 4], - mut nblocks: usize, - r: usize, - s: &mut [__m256i; 25] -) -{ - let mut t; - let mut idx = 0usize; - while nblocks > 0 { - f1600_x4(s); - for i in 0..(r/8) { - t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); - let out0_ptr = out[0].coeffs[idx+8*i..].as_mut_ptr(); - let out1_ptr = 
out[1].coeffs[idx+8*i..].as_mut_ptr(); - _mm_storel_pd(out0_ptr as *mut f64, t); - _mm_storeh_pd(out1_ptr as *mut f64, t); - - t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1)); - let out2_ptr = out[2].coeffs[idx+8*i..].as_mut_ptr(); - let out3_ptr = out[3].coeffs[idx+8*i..].as_mut_ptr(); - _mm_storel_pd(out2_ptr as *mut f64, t); - _mm_storeh_pd(out3_ptr as *mut f64, t); - } - idx += r; - nblocks -= 1; - } -} - -pub unsafe fn keccakx4_squeezeblocks256( - out: &mut [Eta4xBuf; 4], - mut nblocks: usize, - r: usize, - s: &mut [__m256i; 25] -) -{ - let mut t; - let mut idx = 0usize; - while nblocks > 0 { - f1600_x4(s); - for i in 0..(r/8) { - t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); - _mm_storel_pd(out[0].coeffs[idx+8*i..].as_mut_ptr() as *mut f64, t); - _mm_storeh_pd(out[1].coeffs[idx+8*i..].as_mut_ptr() as *mut f64, t); - t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1)); - _mm_storel_pd(out[2].coeffs[idx+8*i..].as_mut_ptr() as *mut f64, t); - _mm_storeh_pd(out[3].coeffs[idx+8*i..].as_mut_ptr() as *mut f64, t); - } - idx += r; - nblocks -= 1; - } -} - -// pub unsafe fn keccakx4_squeezeonce128( -// out: &mut [[u8; 168]; 4], -// s: &mut [__m256i; 25] -// ) -// { -// let mut t; -// f1600_x4(s); -// for i in 0..(SHAKE128_RATE/8) { -// t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); -// let out0_ptr = out[0][8*i] as *mut f64; -// _mm_storel_pd(out0_ptr, t); -// _mm_storeh_pd(out[1][8*i] as *mut f64, t); -// t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1)); -// _mm_storel_pd(out[2][8*i] as *mut f64, t); -// _mm_storeh_pd(out[3][8*i] as *mut f64, t); -// } -// } - -// pub unsafe fn keccakx4_squeezeonce256( -// out: &mut [[u8; 136]; 4], -// s: &mut [__m256i; 25] -// ) -// { -// let mut t; -// f1600_x4(s); -// for i in 0..(SHAKE256_RATE/8) { -// t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); -// _mm_storel_pd(out[0][8*i] as *mut f64, t); -// _mm_storeh_pd(out[1][8*i] as *mut f64, t); -// t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i],1)); -// _mm_storel_pd(out[2][8*i] as *mut f64, t); -// _mm_storeh_pd(out[3][8*i] as *mut f64, t); -// } -// } - -pub unsafe fn shake128x4_absorb_once( - state: &mut Keccakx4State, - in0: &[u8], - in1: &[u8], - in2: &[u8], - in3: &[u8], - inlen: usize, -) -{ - keccakx4_absorb_once( - &mut state.s, - SHAKE128_RATE, - in0, in1, in2, in3, inlen, - 0x1F - ) -} - -pub unsafe fn shake128x4_squeezeblocks( - out: &mut[GenMatrixBuf; 4], - nblocks: usize, - state: &mut Keccakx4State -) -{ - keccakx4_squeezeblocks128( - out, - nblocks, - SHAKE128_RATE, - &mut state.s - ); -} - -pub unsafe fn shake256x4_absorb_once( - state: &mut Keccakx4State, - in0: &[u8], - in1: &[u8], - in2: &[u8], - in3: &[u8], - inlen: usize, -) -{ - keccakx4_absorb_once( - &mut state.s, - SHAKE256_RATE, - in0, in1, in2, in3, inlen, - 0x1F - ) -} - -pub unsafe fn shake256x4_squeezeblocks( - out: &mut[Eta4xBuf; 4], - nblocks: usize, - state: &mut Keccakx4State -) -{ - keccakx4_squeezeblocks256( - out, - nblocks, - SHAKE256_RATE, - &mut state.s - ); -} - -// pub unsafe fn shake128x4( -// out: &mut [GenMatrixBuf; 4], -// mut outlen: usize, -// in0: &[u8], -// in1: &[u8], -// in2: &[u8], -// in3: &[u8], -// inlen: usize -// ) -// { -// let nblocks = outlen/SHAKE128_RATE; -// let mut t = [[0u8; SHAKE128_RATE]; 4]; -// let mut state = Keccakx4State::new(); - -// shake128x4_absorb_once(&mut state, in0, in1, in2, in3, inlen); -// shake128x4_squeezeblocks(out, nblocks, &mut state); -// let idx = nblocks*SHAKE128_RATE; -// outlen -= idx; - -// if outlen > 0 { -// 
keccakx4_squeezeonce128(&mut t, &mut state.s); -// for i in 0..outlen { -// out[0].coeffs[idx+i] = t[0][i]; -// out[1].coeffs[idx+i] = t[1][i]; -// out[2].coeffs[idx+i] = t[2][i]; -// out[3].coeffs[idx+i] = t[3][i]; -// } -// } -// } - -// pub unsafe fn shake256x4( -// out: &mut [Eta4xBuf; 4], -// mut outlen: usize, -// in0: &[u8], -// in1: &[u8], -// in2: &[u8], -// in3: &[u8], -// inlen: usize -// ) -// { -// let nblocks = outlen/SHAKE256_RATE; -// let mut t = [[0u8; SHAKE256_RATE] ; 4]; -// let mut state = Keccakx4State::new(); - -// shake256x4_absorb_once(&mut state, in0, in1, in2, in3, inlen); -// shake256x4_squeezeblocks(out, nblocks, &mut state); - -// let idx = nblocks*SHAKE256_RATE; -// outlen -= nblocks*SHAKE256_RATE; - -// if outlen > 0 { -// keccakx4_squeezeonce256(&mut t, &mut state.s); -// for i in 0..outlen { -// out[0].coeffs[idx+i] = t[0][i]; -// out[1].coeffs[idx+i] = t[1][i]; -// out[2].coeffs[idx+i] = t[2][i]; -// out[3].coeffs[idx+i] = t[3][i]; -// } -// } -// } \ No newline at end of file diff --git a/src/avx2/fq.S b/src/avx2/fq.S deleted file mode 100644 index f595475..0000000 --- a/src/avx2/fq.S +++ /dev/null @@ -1,92 +0,0 @@ -#include "consts.h" -.include "fq.inc" - -.text -reduce128_avx: -#load -vmovdqa (%rdi),%ymm2 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm4 -vmovdqa 96(%rdi),%ymm5 -vmovdqa 128(%rdi),%ymm6 -vmovdqa 160(%rdi),%ymm7 -vmovdqa 192(%rdi),%ymm8 -vmovdqa 224(%rdi),%ymm9 - -red16 2 -red16 3 -red16 4 -red16 5 -red16 6 -red16 7 -red16 8 -red16 9 - -#store -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm4,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm6,128(%rdi) -vmovdqa %ymm7,160(%rdi) -vmovdqa %ymm8,192(%rdi) -vmovdqa %ymm9,224(%rdi) - -ret - -.global reduce_avx -.global _reduce_avx -reduce_avx: -_reduce_avx: -#consts -vmovdqa _16XQ*2(%rsi),%ymm0 -vmovdqa _16XV*2(%rsi),%ymm1 -call reduce128_avx -add $256,%rdi -call reduce128_avx -ret - -tomont128_avx: -#load -vmovdqa (%rdi),%ymm3 -vmovdqa 32(%rdi),%ymm4 -vmovdqa 64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm6 -vmovdqa 128(%rdi),%ymm7 -vmovdqa 160(%rdi),%ymm8 -vmovdqa 192(%rdi),%ymm9 -vmovdqa 224(%rdi),%ymm10 - -fqmulprecomp 1,2,3,11 -fqmulprecomp 1,2,4,12 -fqmulprecomp 1,2,5,13 -fqmulprecomp 1,2,6,14 -fqmulprecomp 1,2,7,15 -fqmulprecomp 1,2,8,11 -fqmulprecomp 1,2,9,12 -fqmulprecomp 1,2,10,13 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm4,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm6,96(%rdi) -vmovdqa %ymm7,128(%rdi) -vmovdqa %ymm8,160(%rdi) -vmovdqa %ymm9,192(%rdi) -vmovdqa %ymm10,224(%rdi) - -ret - -.global tomont_avx -.global _tomont_avx -tomont_avx: -_tomont_avx: -#consts -vmovdqa _16XQ*2(%rsi),%ymm0 -vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 -vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 -call tomont128_avx -add $256,%rdi -call tomont128_avx -ret diff --git a/src/avx2/fq.inc b/src/avx2/fq.inc deleted file mode 100644 index 4b7afc3..0000000 --- a/src/avx2/fq.inc +++ /dev/null @@ -1,30 +0,0 @@ -.macro red16 r,rs=0,x=12 -vpmulhw %ymm1,%ymm\r,%ymm\x -.if \rs -vpmulhrsw %ymm\rs,%ymm\x,%ymm\x -.else -vpsraw $10,%ymm\x,%ymm\x -.endif -vpmullw %ymm0,%ymm\x,%ymm\x -vpsubw %ymm\x,%ymm\r,%ymm\r -.endm - -.macro csubq r,x=12 -vpsubw %ymm0,%ymm\r,%ymm\r -vpsraw $15,%ymm\r,%ymm\x -vpand %ymm0,%ymm\x,%ymm\x -vpaddw %ymm\x,%ymm\r,%ymm\r -.endm - -.macro caddq r,x=12 -vpsraw $15,%ymm\r,%ymm\x -vpand %ymm0,%ymm\x,%ymm\x -vpaddw %ymm\x,%ymm\r,%ymm\r -.endm - -.macro fqmulprecomp al,ah,b,x=12 -vpmullw %ymm\al,%ymm\b,%ymm\x -vpmulhw %ymm\ah,%ymm\b,%ymm\b -vpmulhw %ymm0,%ymm\x,%ymm\x -vpsubw %ymm\x,%ymm\b,%ymm\b -.endm diff --git 
a/src/avx2/indcpa.rs b/src/avx2/indcpa.rs deleted file mode 100644 index 0e66c65..0000000 --- a/src/avx2/indcpa.rs +++ /dev/null @@ -1,678 +0,0 @@ -use core::arch::x86_64::*; -#[cfg(not(feature = "90s"))] -use crate::{fips202::*, fips202x4::*}; -#[cfg(feature = "90s")] -use crate::{aes256ctr::*, cbd::*}; -use crate::rng::randombytes; -use crate::{ - align::*, - CryptoRng, - params::*, - poly::*, - polyvec::*, - rejsample::*, - RngCore, - symmetric::*, -}; - -// Name: pack_pk -// -// Description: Serialize the public key as concatenation of the -// serialized vector of polynomials pk -// and the public seed used to generate the matrix A. -// -// Arguments: [u8] r: the output serialized public key -// const poly *pk: the input public-key polynomial -// const [u8] seed: the input public seed -fn pack_pk(r: &mut[u8], pk: &Polyvec, seed: &[u8]) -{ - polyvec_tobytes(r, pk); - r[KYBER_POLYVECBYTES..][..KYBER_SYMBYTES] - .copy_from_slice(&seed[..KYBER_SYMBYTES]); -} - -// Name: unpack_pk -// -// Description: De-serialize public key from a byte array; -// approximate inverse of pack_pk -// -// Arguments: - Polyvec pk: output public-key vector of polynomials -// - [u8] seed: output seed to generate matrix A -// - const [u8] packedpk: input serialized public key -fn unpack_pk(pk: &mut Polyvec, seed: &mut[u8], packedpk: &[u8]) -{ - unsafe {polyvec_frombytes(pk, packedpk);} - seed[..KYBER_SYMBYTES] - .copy_from_slice(&packedpk[KYBER_POLYVECBYTES..][..KYBER_SYMBYTES]); -} - -// Name: pack_sk -// -// Description: Serialize the secret key -// -// Arguments: - [u8] r: output serialized secret key -// - const Polyvec sk: input vector of polynomials (secret key) -fn pack_sk(r: &mut[u8], sk: &Polyvec) -{ - polyvec_tobytes(r, sk); -} - -// Name: unpack_sk -// -// Description: De-serialize the secret key; -// inverse of pack_sk -// -// Arguments: - Polyvec sk: output vector of polynomials (secret key) -// - const [u8] packedsk: input serialized secret key -fn unpack_sk(sk: &mut Polyvec, packedsk: &[u8]) -{ - unsafe {polyvec_frombytes(sk, packedsk);} -} - -// Name: pack_ciphertext -// -// Description: Serialize the ciphertext as concatenation of the -// compressed and serialized vector of polynomials b -// and the compressed and serialized polynomial v -// -// Arguments: [u8] r: the output serialized ciphertext -// const poly *pk: the input vector of polynomials b -// const [u8] seed: the input polynomial v -fn pack_ciphertext(r: &mut[u8], b: &Polyvec, v: Poly) -{ - unsafe { - polyvec_compress(r, b); - poly_compress(&mut r[KYBER_POLYVECCOMPRESSEDBYTES..], v); - } -} - -// Name: unpack_ciphertext -// -// Description: De-serialize and decompress ciphertext from a byte array; -// approximate inverse of pack_ciphertext -// -// Arguments: - Polyvec b: output vector of polynomials b -// - Poly *v: output polynomial v -// - const [u8] c: input serialized ciphertext -fn unpack_ciphertext(b: &mut Polyvec, v: &mut Poly, c: &[u8]) -{ - unsafe { - polyvec_decompress(b, c); - poly_decompress(v, &c[KYBER_POLYVECCOMPRESSEDBYTES..]); - } -} - -// Name: rej_uniform -// -// Description: Run rejection sampling on uniform random bytes to generate -// uniform random integers mod q -// -// Arguments: - i16 *r: output buffer -// - usize len: requested number of 16-bit integers (uniform mod q) -// - const [u8] buf: input buffer (assumed to be uniform random bytes) -// - usize buflen: length of input buffer in bytes -// -// Returns number of sampled 16-bit integers (at most len) -fn rej_uniform(r: &mut[i16], len: usize, buf: &[u8], 
buflen: usize) -> usize -{ - let (mut ctr, mut pos) = (0usize, 0usize); - let (mut val0, mut val1); - - while ctr < len && pos + 3 <= buflen { - val0 = ((buf[pos+0] >> 0) as u16 | (buf[pos+1] as u16) << 8) & 0xFFF; - val1 = ((buf[pos+1] >> 4) as u16 | (buf[pos+2] as u16) << 4) & 0xFFF; - pos += 3; - - if val0 < KYBER_Q as u16 { - r[ctr] = val0 as i16; - ctr += 1; - } - if ctr < len && val1 < KYBER_Q as u16 { - r[ctr] = val1 as i16; - ctr += 1; - } - } - ctr -} - -pub fn gen_a(a: &mut[Polyvec], b: &[u8]) -{ - unsafe { gen_matrix(a, b, false); } -} - -pub fn gen_at(a: &mut[Polyvec], b: &[u8]) -{ - unsafe { gen_matrix(a, b, true); } -} - -#[cfg(feature="90s")] -unsafe fn gen_matrix(a: &mut[Polyvec], seed: &[u8], transposed: bool) -{ - let (mut ctr, mut off, mut buflen); - let mut nonce: u64; - let mut state = Aes256CtrCtx::new(); - let mut buf = GenMatrixBuf90s::new(); - aes256ctr_init(&mut state, seed, [0u8; 12]); - for i in 0..KYBER_K { - for j in 0..KYBER_K { - if transposed { - nonce = ((j << 8) | i) as u64; - } else { - nonce = ((i << 8) | j) as u64; - } - state.n = _mm_loadl_epi64([nonce].as_ptr() as *const __m128i); - aes256ctr_squeezeblocks(&mut buf.coeffs, REJ_UNIFORM_AVX_NBLOCKS, &mut state); - buflen = REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES; - ctr = rej_uniform_avx(&mut a[i].vec[j].coeffs, &buf.coeffs); - while ctr < KYBER_N { - off = buflen % 3; - for k in 0..off { - buf.coeffs[k] = buf.coeffs[buflen - off + k]; - } - aes256ctr_squeezeblocks(&mut buf.coeffs[off..], 1, &mut state); - buflen = off + XOF_BLOCKBYTES; - ctr += rej_uniform(&mut a[i].vec[j].coeffs[ctr..], KYBER_N-ctr, &buf.coeffs, buflen); - } - poly_nttunpack(&mut a[i].vec[j]); - } - } -} - -#[cfg(all(feature="kyber512", not(feature="90s")))] -unsafe fn gen_matrix(a: &mut[Polyvec], seed: &[u8], transposed: bool) -{ - let mut state = Keccakx4State::new(); - let mut buf = [GenMatrixBuf::new(); 4]; - - let f = _mm256_loadu_si256(seed[..].as_ptr() as *const __m256i); - _mm256_store_si256(buf[0].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[1].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[2].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[3].vec.as_mut_ptr(), f); - - if transposed { - buf[0].coeffs[32] = 0; - buf[0].coeffs[33] = 0; - buf[1].coeffs[32] = 0; - buf[1].coeffs[33] = 1; - buf[2].coeffs[32] = 1; - buf[2].coeffs[33] = 0; - buf[3].coeffs[32] = 1; - buf[3].coeffs[33] = 1; - } - else { - buf[0].coeffs[32] = 0; - buf[0].coeffs[33] = 0; - buf[1].coeffs[32] = 1; - buf[1].coeffs[33] = 0; - buf[2].coeffs[32] = 0; - buf[2].coeffs[33] = 1; - buf[3].coeffs[32] = 1; - buf[3].coeffs[33] = 1; - } - - shake128x4_absorb_once( - &mut state, &buf[0].coeffs, &buf[1].coeffs, &buf[2].coeffs, &buf[3].coeffs, 34 - ); - shake128x4_squeezeblocks(&mut buf, REJ_UNIFORM_AVX_NBLOCKS, &mut state); - - let mut ctr0 = rej_uniform_avx(&mut a[0].vec[0].coeffs, &buf[0].coeffs); - let mut ctr1 = rej_uniform_avx(&mut a[0].vec[1].coeffs, &buf[1].coeffs); - let mut ctr2 = rej_uniform_avx(&mut a[1].vec[0].coeffs, &buf[2].coeffs); - let mut ctr3 = rej_uniform_avx(&mut a[1].vec[1].coeffs, &buf[3].coeffs); - - while ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N { - shake128x4_squeezeblocks(&mut buf, 1, &mut state); - - ctr0 += rej_uniform( - &mut a[0].vec[0].coeffs[ctr0..], KYBER_N - ctr0, &buf[0].coeffs, SHAKE128_RATE - ); - ctr1 += rej_uniform( - &mut a[0].vec[1].coeffs[ctr1..], KYBER_N - ctr1, &buf[1].coeffs, SHAKE128_RATE - ); - ctr2 += rej_uniform( - &mut a[1].vec[0].coeffs[ctr2..], KYBER_N - ctr2, &buf[2].coeffs, SHAKE128_RATE - 
); - ctr3 += rej_uniform( - &mut a[1].vec[1].coeffs[ctr3..], KYBER_N - ctr3, &buf[3].coeffs, SHAKE128_RATE - ); - } - - poly_nttunpack(&mut a[0].vec[0]); - poly_nttunpack(&mut a[0].vec[1]); - poly_nttunpack(&mut a[1].vec[0]); - poly_nttunpack(&mut a[1].vec[1]); -} - -#[cfg(all(not(feature="kyber512"), not(feature="kyber1024"), not(feature="90s")))] -unsafe fn gen_matrix(a: &mut[Polyvec], seed: &[u8], transposed: bool) -{ - let mut state = Keccakx4State::new(); - let mut state1x = KeccakState::new(); - let mut buf = [GenMatrixBuf::new(); 4]; - - let mut f = _mm256_loadu_si256(seed.as_ptr() as *const __m256i); - _mm256_store_si256(buf[0].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[1].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[2].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[3].vec.as_mut_ptr(), f); - - if transposed { - buf[0].coeffs[32] = 0; - buf[0].coeffs[33] = 0; - buf[1].coeffs[32] = 0; - buf[1].coeffs[33] = 1; - buf[2].coeffs[32] = 0; - buf[2].coeffs[33] = 2; - buf[3].coeffs[32] = 1; - buf[3].coeffs[33] = 0; - } - else { - buf[0].coeffs[32] = 0; - buf[0].coeffs[33] = 0; - buf[1].coeffs[32] = 1; - buf[1].coeffs[33] = 0; - buf[2].coeffs[32] = 2; - buf[2].coeffs[33] = 0; - buf[3].coeffs[32] = 0; - buf[3].coeffs[33] = 1; - } - - shake128x4_absorb_once( - &mut state, - &buf[0].coeffs, &buf[1].coeffs, - &buf[2].coeffs, &buf[3].coeffs, - 34 - ); - shake128x4_squeezeblocks( - &mut buf, - REJ_UNIFORM_AVX_NBLOCKS, &mut state - ); - - let mut ctr0 = rej_uniform_avx(&mut a[0].vec[0].coeffs, &buf[0].coeffs); - let mut ctr1 = rej_uniform_avx(&mut a[0].vec[1].coeffs, &buf[1].coeffs); - let mut ctr2 = rej_uniform_avx(&mut a[0].vec[2].coeffs, &buf[2].coeffs); - let mut ctr3 = rej_uniform_avx(&mut a[1].vec[0].coeffs, &buf[3].coeffs); - - while ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N { - shake128x4_squeezeblocks(&mut buf, 1, &mut state); - - ctr0 += rej_uniform( - &mut a[0].vec[0].coeffs[ctr0..], KYBER_N - ctr0, &buf[0].coeffs, SHAKE128_RATE - ); - ctr1 += rej_uniform( - &mut a[0].vec[1].coeffs[ctr1..], KYBER_N - ctr1, &buf[1].coeffs, SHAKE128_RATE - ); - ctr2 += rej_uniform( - &mut a[0].vec[2].coeffs[ctr2..], KYBER_N - ctr2, &buf[2].coeffs, SHAKE128_RATE - ); - ctr3 += rej_uniform( - &mut a[1].vec[0].coeffs[ctr3..], KYBER_N - ctr3, &buf[3].coeffs, SHAKE128_RATE - ); - } - - poly_nttunpack(&mut a[0].vec[0]); - poly_nttunpack(&mut a[0].vec[1]); - poly_nttunpack(&mut a[0].vec[2]); - poly_nttunpack(&mut a[1].vec[0]); - - f = _mm256_loadu_si256(seed.as_ptr() as *const __m256i); - _mm256_store_si256(buf[0].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[1].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[2].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[3].vec.as_mut_ptr(), f); - - if transposed { - buf[0].coeffs[32] = 1; - buf[0].coeffs[33] = 1; - buf[1].coeffs[32] = 1; - buf[1].coeffs[33] = 2; - buf[2].coeffs[32] = 2; - buf[2].coeffs[33] = 0; - buf[3].coeffs[32] = 2; - buf[3].coeffs[33] = 1; - } - else { - buf[0].coeffs[32] = 1; - buf[0].coeffs[33] = 1; - buf[1].coeffs[32] = 2; - buf[1].coeffs[33] = 1; - buf[2].coeffs[32] = 0; - buf[2].coeffs[33] = 2; - buf[3].coeffs[32] = 1; - buf[3].coeffs[33] = 2; - } - - shake128x4_absorb_once( - &mut state, - &buf[0].coeffs, &buf[1].coeffs, - &buf[2].coeffs, &buf[3].coeffs, - 34 - ); - shake128x4_squeezeblocks(&mut buf, REJ_UNIFORM_AVX_NBLOCKS, &mut state); - - ctr0 = rej_uniform_avx(&mut a[1].vec[1].coeffs, &buf[0].coeffs); - ctr1 = rej_uniform_avx(&mut a[1].vec[2].coeffs, &buf[1].coeffs); - ctr2 = rej_uniform_avx(&mut 
a[2].vec[0].coeffs, &buf[2].coeffs); - ctr3 = rej_uniform_avx(&mut a[2].vec[1].coeffs, &buf[3].coeffs); - - while ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N { - shake128x4_squeezeblocks(&mut buf, 1, &mut state); - - ctr0 += rej_uniform( - &mut a[1].vec[1].coeffs[ctr0..], KYBER_N - ctr0, &buf[0].coeffs, SHAKE128_RATE - ); - ctr1 += rej_uniform( - &mut a[1].vec[2].coeffs[ctr1..], KYBER_N - ctr1, &buf[1].coeffs, SHAKE128_RATE - ); - ctr2 += rej_uniform( - &mut a[2].vec[0].coeffs[ctr2..], KYBER_N - ctr2, &buf[2].coeffs, SHAKE128_RATE - ); - ctr3 += rej_uniform( - &mut a[2].vec[1].coeffs[ctr3..], KYBER_N - ctr3, &buf[3].coeffs, SHAKE128_RATE - ); - } - - poly_nttunpack(&mut a[1].vec[1]); - poly_nttunpack(&mut a[1].vec[2]); - poly_nttunpack(&mut a[2].vec[0]); - poly_nttunpack(&mut a[2].vec[1]); - - f = _mm256_loadu_si256(seed.as_ptr() as *const __m256i); - _mm256_store_si256(buf[0].vec.as_mut_ptr(), f); - buf[0].coeffs[32] = 2; - buf[0].coeffs[33] = 2; - shake128_absorb_once(&mut state1x, &buf[0].coeffs, 34); - shake128_squeezeblocks(&mut buf[0].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &mut state1x); - ctr0 = rej_uniform_avx(&mut a[2].vec[2].coeffs, &buf[0].coeffs); - while ctr0 < KYBER_N { - shake128_squeezeblocks(&mut buf[0].coeffs, 1, &mut state1x); - ctr0 += rej_uniform( - &mut a[2].vec[2].coeffs[ctr0..], KYBER_N - ctr0, &buf[0].coeffs, SHAKE128_RATE - ); - } - - poly_nttunpack(&mut a[2].vec[2]); -} - -#[cfg(all(feature="kyber1024", not(feature="90s")))] -unsafe fn gen_matrix(a: &mut[Polyvec], seed: &[u8], transposed: bool) -{ - let mut f; - let mut state = Keccakx4State::new(); - let mut buf = [GenMatrixBuf::new(); 4]; - - for i in 0..4usize { - f = _mm256_loadu_si256(seed[..].as_ptr() as *const __m256i); - _mm256_store_si256(buf[0].coeffs.as_mut_ptr() as *mut __m256i, f); - _mm256_store_si256(buf[1].coeffs.as_mut_ptr() as *mut __m256i, f); - _mm256_store_si256(buf[2].coeffs.as_mut_ptr() as *mut __m256i, f); - _mm256_store_si256(buf[3].coeffs.as_mut_ptr() as *mut __m256i, f); - - if transposed { - for j in 0..4 { - buf[j].coeffs[32] = i as u8; - buf[j].coeffs[33] = j as u8; - } - } else { - for j in 0..4 { - buf[j].coeffs[32] = j as u8; - buf[j].coeffs[33] = i as u8; - } - } - - shake128x4_absorb_once( - &mut state, - &buf[0].coeffs, &buf[1].coeffs, &buf[2].coeffs, &buf[3].coeffs, 34 - ); - shake128x4_squeezeblocks(&mut buf, REJ_UNIFORM_AVX_NBLOCKS, &mut state); - - let mut ctr0 = rej_uniform_avx(&mut a[i].vec[0].coeffs, &buf[0].coeffs); - let mut ctr1 = rej_uniform_avx(&mut a[i].vec[1].coeffs, &buf[1].coeffs); - let mut ctr2 = rej_uniform_avx(&mut a[i].vec[2].coeffs, &buf[2].coeffs); - let mut ctr3 = rej_uniform_avx(&mut a[i].vec[3].coeffs, &buf[3].coeffs); - - while ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N { - shake128x4_squeezeblocks(&mut buf, 1, &mut state); - - ctr0 += rej_uniform( - &mut a[i].vec[0].coeffs[ctr0..], KYBER_N - ctr0, &buf[0].coeffs, SHAKE128_RATE - ); - ctr1 += rej_uniform( - &mut a[i].vec[1].coeffs[ctr1..], KYBER_N - ctr1, &buf[1].coeffs, SHAKE128_RATE - ); - ctr2 += rej_uniform( - &mut a[i].vec[2].coeffs[ctr2..], KYBER_N - ctr2, &buf[2].coeffs, SHAKE128_RATE - ); - ctr3 += rej_uniform( - &mut a[i].vec[3].coeffs[ctr3..], KYBER_N - ctr3, &buf[3].coeffs, SHAKE128_RATE - ); - } - - poly_nttunpack(&mut a[i].vec[0]); - poly_nttunpack(&mut a[i].vec[1]); - poly_nttunpack(&mut a[i].vec[2]); - poly_nttunpack(&mut a[i].vec[3]); - } -} - -pub fn indcpa_keypair( - pk: &mut[u8], - sk: &mut[u8], - _seed: Option<(&[u8], &[u8])>, - _rng: 
&mut R -) - where R: CryptoRng + RngCore -{ - - let mut a = [Polyvec::new(); KYBER_K]; - let (mut e, mut pkpv, mut skpv) = (Polyvec::new(), Polyvec::new(), Polyvec::new()); - let mut buf = [0u8; 2*KYBER_SYMBYTES]; - let mut randbuf = [0u8; 2*KYBER_SYMBYTES]; - - if let Some(s) = _seed { - randbuf[..KYBER_SYMBYTES].copy_from_slice(&s.0); - } else { - randombytes(&mut randbuf, KYBER_SYMBYTES, _rng); - } - - hash_g(&mut buf, &randbuf, KYBER_SYMBYTES); - - let (publicseed, noiseseed) = buf.split_at(KYBER_SYMBYTES); - gen_a(&mut a, publicseed); - - #[cfg(feature="90s")] - { - // Assumes divisibility - const NOISE_NBLOCKS: usize = (KYBER_ETA1*KYBER_N/4)/XOF_BLOCKBYTES; - let mut nonce = 0u64; - let mut state = Aes256CtrCtx::new(); - let mut coins = IndcpaBuf::new(); - aes256ctr_init(&mut state, noiseseed, [0u8; 12]); - nonce += 1; - unsafe { - for i in 0..KYBER_K { - aes256ctr_squeezeblocks(&mut coins.coeffs, NOISE_NBLOCKS, &mut state); - state.n = _mm_loadl_epi64([nonce].as_ptr() as *const __m128i); - nonce += 1; - poly_cbd_eta1_90s(&mut skpv.vec[i], &coins); - } - for i in 0..KYBER_K { - aes256ctr_squeezeblocks(&mut coins.coeffs, NOISE_NBLOCKS, &mut state); - state.n = _mm_loadl_epi64([nonce].as_ptr() as *const __m128i); - nonce += 1; - poly_cbd_eta1_90s(&mut e.vec[i], &coins); - } - } - } - - #[cfg(all(feature="kyber512", not(feature="90s")))] - { - let (skpv0, skpv1) =skpv.vec.split_at_mut(1); - let (e0, e1) = e.vec.split_at_mut(1); - poly_getnoise_eta1_4x( - &mut skpv0[0], &mut skpv1[0], &mut e0[0], &mut e1[0], noiseseed, 0, 1, 2, 3 - ); - } - - #[cfg(all(feature="kyber1024", not(feature="90s")))] - { - let (skpv0, skpv1) = skpv.vec.split_at_mut(1); - let (skpv1, skpv2) = skpv1.split_at_mut(1); - let (skpv2, skpv3) = skpv2.split_at_mut(1); - poly_getnoise_eta1_4x( - &mut skpv0[0], &mut skpv1[0], &mut skpv2[0], &mut skpv3[0], noiseseed, 0, 1, 2, 3 - ); - let (e0, e1) = e.vec.split_at_mut(1); - let (e1, e2) = e1.split_at_mut(1); - let (e2, e3) = e2.split_at_mut(1); - poly_getnoise_eta1_4x( - &mut e0[0], &mut e1[0], &mut e2[0], &mut e3[0], noiseseed, 4, 5, 6, 7 - ); - } - - #[cfg(not(any(feature="kyber1024", feature="kyber512", feature="90s")))] // kyber768 - { - let (skpv0, skpv1) = skpv.vec.split_at_mut(1); - let (skpv1, skpv2) = skpv1.split_at_mut(1); - poly_getnoise_eta1_4x( - &mut skpv0[0], &mut skpv1[0], &mut skpv2[0], &mut e.vec[0], noiseseed, 0, 1, 2, 3 - ); - let (e1, e2) = e.vec.split_at_mut(2); - let (pkpv0, pkpv1) = pkpv.vec.split_at_mut(1); - poly_getnoise_eta1_4x( - &mut e1[1], &mut e2[0], &mut pkpv0[0], &mut pkpv1[0], noiseseed, 4, 5, 6, 7 - ); - } - - polyvec_ntt(&mut skpv); - polyvec_reduce(&mut skpv); - polyvec_ntt(&mut e); - - for i in 0..KYBER_K { - polyvec_basemul_acc_montgomery(&mut pkpv.vec[i], &a[i], &skpv); - poly_tomont(&mut pkpv.vec[i]); - } - - polyvec_add(&mut pkpv, &e); - polyvec_reduce(&mut pkpv); - - pack_sk(sk, &skpv); - pack_pk(pk, &pkpv, publicseed); -} - -pub fn indcpa_enc(c: &mut[u8], m: &[u8], pk: &[u8], coins: &[u8]) -{ - unsafe { - let mut at = [Polyvec::new(); KYBER_K]; - let (mut sp, mut pkpv, mut ep, mut b) = (Polyvec::new(),Polyvec::new(), Polyvec::new(), Polyvec::new()); - let (mut v, mut k, mut epp) = (Poly::new(), Poly::new(), Poly::new()); - let mut seed = [0u8; KYBER_SYMBYTES]; - - unpack_pk(&mut pkpv, &mut seed, pk); - poly_frommsg(&mut k, m); - gen_at(&mut at, &seed); - - #[cfg(feature="90s")] - { - const NOISE_NBLOCKS: usize = (KYBER_ETA1*KYBER_N/4)/XOF_BLOCKBYTES; - const CIPHERTEXTNOISE_NBLOCKS: usize = 
(KYBER_ETA2*KYBER_N/4)/XOF_BLOCKBYTES; - let mut buf = IndcpaBuf::new(); - let mut state = Aes256CtrCtx::new(); - let mut nonce = 0u64; - aes256ctr_init(&mut state, coins, [0u8; 12]); - nonce += 1; - for i in 0..KYBER_K { - aes256ctr_squeezeblocks(&mut buf.coeffs, NOISE_NBLOCKS, &mut state); - state.n = _mm_loadl_epi64([nonce, 0].as_ptr() as *const __m128i); - nonce += 1; - poly_cbd_eta1_90s(&mut sp.vec[i], &buf); - } - for i in 0..KYBER_K { - aes256ctr_squeezeblocks(&mut buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &mut state); - state.n = _mm_loadl_epi64([nonce, 0].as_ptr() as *const __m128i); - nonce += 1; - poly_cbd_eta2(&mut ep.vec[i], &buf.vec); - } - aes256ctr_squeezeblocks(&mut buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &mut state); - poly_cbd_eta2(&mut epp, &buf.vec); - } - - #[cfg(all(feature="kyber512", not(feature="90s")))] - { - let (sp0, sp1) = sp.vec.split_at_mut(1); - let (ep0, ep1) = ep.vec.split_at_mut(1); - poly_getnoise_eta1122_4x( - &mut sp0[0], &mut sp1[0], &mut ep0[0], &mut ep1[0], coins, 0, 1, 2, 3 - ); - poly_getnoise_eta2(&mut epp, coins, 4); - } - - #[cfg(not(any(feature="kyber1024", feature="kyber512", feature="90s")))] // kyber768) - { - let (sp0, sp1) = sp.vec.split_at_mut(1); - let (sp1, sp2) = sp1.split_at_mut(1); - poly_getnoise_eta1_4x( - &mut sp0[0], &mut sp1[0], &mut sp2[0], &mut ep.vec[0], coins, 0, 1, 2 ,3 - ); - let (ep1, ep2) = ep.vec.split_at_mut(2); - poly_getnoise_eta1_4x( - &mut ep1[1], &mut ep2[0], &mut epp, &mut b.vec[0], coins, 4, 5, 6, 7 - ); - } - - #[cfg(all(feature="kyber1024", not(feature="90s")))] - { - let (sp0, sp1) = sp.vec.split_at_mut(1); - let (sp1, sp2) = sp1.split_at_mut(1); - let (sp2, sp3) = sp2.split_at_mut(1); - poly_getnoise_eta1_4x( - &mut sp0[0], &mut sp1[0], &mut sp2[0],&mut sp3[0], coins, 0, 1, 2, 3 - ); - let (ep0, ep1) = ep.vec.split_at_mut(1); - let (ep1, ep2) = ep1.split_at_mut(1); - let (ep2, ep3) = ep2.split_at_mut(1); - poly_getnoise_eta1_4x( - &mut ep0[0], &mut ep1[0], &mut ep2[0],&mut ep3[0], coins, 4, 5, 6, 7 - ); - poly_getnoise_eta2(&mut epp, coins, 8); - } - - polyvec_ntt(&mut sp); - - for i in 0..KYBER_K { - polyvec_basemul_acc_montgomery(&mut b.vec[i], &at[i], &sp); - } - polyvec_basemul_acc_montgomery(&mut v, &pkpv, &sp); - - polyvec_invntt_tomont(&mut b); - poly_invntt_tomont(&mut v); - - polyvec_add(&mut b, &ep); - poly_add(&mut v, &epp); - poly_add(&mut v, &k); - - polyvec_reduce(&mut b); - poly_reduce(&mut v); - - pack_ciphertext(c, &b, v); - } -} - -pub fn indcpa_dec(m: &mut[u8], c: &[u8], sk: &[u8]) -{ - let (mut b, mut skpv) = (Polyvec::new(),Polyvec::new()); - let (mut v, mut mp) = (Poly::new(),Poly::new()); - - unpack_ciphertext(&mut b, &mut v, c); - unpack_sk(&mut skpv, sk); - - polyvec_ntt(&mut b); - polyvec_basemul_acc_montgomery(&mut mp, &skpv, &b); - - poly_invntt_tomont(&mut mp); - poly_sub(&mut mp, &v); - poly_reduce(&mut mp); - - poly_tomsg(m, mp); -} \ No newline at end of file diff --git a/src/avx2/invntt.S b/src/avx2/invntt.S deleted file mode 100644 index 0cc7c6f..0000000 --- a/src/avx2/invntt.S +++ /dev/null @@ -1,195 +0,0 @@ -#include "consts.h" -.include "shuffle.inc" -.include "fq.inc" - -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3 -vpsubw %ymm\rl0,%ymm\rh0,%ymm12 -vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rl1,%ymm\rh1,%ymm13 - -vpmullw %ymm\zl0,%ymm12,%ymm\rh0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\rl2,%ymm\rh2,%ymm14 - -vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 -vpsubw %ymm\rl3,%ymm\rh3,%ymm15 - -vpmullw 
%ymm\zl1,%ymm14,%ymm\rh2 -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 -vpmullw %ymm\zl1,%ymm15,%ymm\rh3 - -vpmulhw %ymm\zh0,%ymm12,%ymm12 -vpmulhw %ymm\zh0,%ymm13,%ymm13 - -vpmulhw %ymm\zh1,%ymm14,%ymm14 -vpmulhw %ymm\zh1,%ymm15,%ymm15 - -vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 - -vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 - -vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 -vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 - -# - -# - -vpsubw %ymm\rh0,%ymm12,%ymm\rh0 - -vpsubw %ymm\rh1,%ymm13,%ymm\rh1 - -vpsubw %ymm\rh2,%ymm14,%ymm\rh2 -vpsubw %ymm\rh3,%ymm15,%ymm\rh3 -.endm - -.macro intt_levels0t5 off -/* level 0 */ -vmovdqa _16XFLO*2(%rsi),%ymm2 -vmovdqa _16XFHI*2(%rsi),%ymm3 - -vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 -vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 -vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 -vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 - -fqmulprecomp 2,3,4 -fqmulprecomp 2,3,6 -fqmulprecomp 2,3,5 -fqmulprecomp 2,3,7 - -vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 -vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 -vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 -vmovdqa (128*\off+112)*2(%rdi),%ymm11 - -fqmulprecomp 2,3,8 -fqmulprecomp 2,3,10 -fqmulprecomp 2,3,9 -fqmulprecomp 2,3,11 - -vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15 -vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1 -vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2 -vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3 -vmovdqa _REVIDXB*2(%rsi),%ymm12 -vpshufb %ymm12,%ymm15,%ymm15 -vpshufb %ymm12,%ymm1,%ymm1 -vpshufb %ymm12,%ymm2,%ymm2 -vpshufb %ymm12,%ymm3,%ymm3 - -butterfly 4,5,8,9,6,7,10,11,15,1,2,3 - -/* level 1 */ -vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2 -vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3 -vmovdqa _REVIDXB*2(%rsi),%ymm1 -vpshufb %ymm1,%ymm2,%ymm2 -vpshufb %ymm1,%ymm3,%ymm3 - -butterfly 4,5,6,7,8,9,10,11,2,2,3,3 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 -shuffle1 10,11,8,11 - -/* level 2 */ -vmovdqa _REVIDXD*2(%rsi),%ymm12 -vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2 -vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10 - -butterfly 3,4,6,8,5,7,9,11,2,2,10,10 - -vmovdqa _16XV*2(%rsi),%ymm1 -red16 3 - -shuffle2 3,4,10,4 -shuffle2 6,8,3,8 -shuffle2 5,7,6,7 -shuffle2 9,11,5,11 - -/* level 3 */ -vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2 -vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9 - -butterfly 10,3,6,5,4,8,7,11,2,2,9,9 - -shuffle4 10,3,9,3 -shuffle4 6,5,10,5 -shuffle4 4,8,6,8 -shuffle4 7,11,4,11 - -/* level 4 */ -vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2 -vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7 - -butterfly 9,10,6,4,3,5,8,11,2,2,7,7 - -red16 9 - -shuffle8 9,10,7,10 -shuffle8 6,4,9,4 -shuffle8 3,5,6,5 -shuffle8 8,11,3,11 - -/* level 5 */ -vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2 -vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8 - -butterfly 7,9,6,3,10,4,5,11,2,2,8,8 - -vmovdqa %ymm7,(128*\off+ 0)*2(%rdi) -vmovdqa %ymm9,(128*\off+ 16)*2(%rdi) -vmovdqa %ymm6,(128*\off+ 32)*2(%rdi) -vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) -vmovdqa %ymm10,(128*\off+ 64)*2(%rdi) -vmovdqa %ymm4,(128*\off+ 80)*2(%rdi) -vmovdqa %ymm5,(128*\off+ 96)*2(%rdi) -vmovdqa %ymm11,(128*\off+112)*2(%rdi) -.endm - -.macro intt_level6 off -/* level 6 */ -vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 -vmovdqa (64*\off+128)*2(%rdi),%ymm8 -vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 -vmovdqa (64*\off+144)*2(%rdi),%ymm9 -vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2 - -vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 -vmovdqa (64*\off+160)*2(%rdi),%ymm10 -vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 -vmovdqa (64*\off+176)*2(%rdi),%ymm11 
-vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3 - -butterfly 4,5,6,7,8,9,10,11 - -.if \off == 0 -red16 4 -.endif - -vmovdqa %ymm4,(64*\off+ 0)*2(%rdi) -vmovdqa %ymm5,(64*\off+ 16)*2(%rdi) -vmovdqa %ymm6,(64*\off+ 32)*2(%rdi) -vmovdqa %ymm7,(64*\off+ 48)*2(%rdi) -vmovdqa %ymm8,(64*\off+128)*2(%rdi) -vmovdqa %ymm9,(64*\off+144)*2(%rdi) -vmovdqa %ymm10,(64*\off+160)*2(%rdi) -vmovdqa %ymm11,(64*\off+176)*2(%rdi) -.endm - -.text -.global invntt_avx -.global _invntt_avx -invntt_avx: -_invntt_avx: -vmovdqa _16XQ*2(%rsi),%ymm0 - -intt_levels0t5 0 -intt_levels0t5 1 - -intt_level6 0 -intt_level6 1 -ret diff --git a/src/avx2/keccak4x.rs b/src/avx2/keccak4x.rs deleted file mode 100644 index db6eeb9..0000000 --- a/src/avx2/keccak4x.rs +++ /dev/null @@ -1,220 +0,0 @@ -// Macro and function code structure is the work of Marek Kotewicz -// plus contributors to the tiny-keccak crate licensed under -// Creative Commons CC0 1.0 Universal. Thankyou. -// https://github.com/debris/tiny-keccak - -// Copyright 2020-2021 Mitchell Berry -// Licensed under the Apache License, Version 2.0 - -// Drop-in Rust replacement for KeccakP-1600-times4 function for -// the eXtended Keccak Code Package https://github.com/XKCP/XKCP - -// Test vectors taken from: -// https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt - -use core::arch::x86_64::*; - -#[repr(C, align(32))] -union RC_Data { - vecs: [__m256i; 24], - u: [u64; 96] -} - -#[repr(C, align(32))] -union Temp { - pub vec: __m256i, - pub u: [u64; 4] -} - -const RHO: [u32; 24] = [ - 1, 3, 6, 10, 15, 21, 28, 36, - 45, 55, 2, 14, 27, 41, 56, 8, - 25, 43, 62, 18, 39, 61, 20, 44, -]; - -const PI: [usize; 24] = [ - 10, 7, 11, 17, 18, 3, 5, 16, - 8, 21, 24, 4, 15, 23, 19, 13, - 12, 2, 20, 14, 22, 9, 6, 1, -]; - -// Set __mm256i constants with a union -const RC_X4: RC_Data = RC_Data { u: [ - 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, - 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, - 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, - 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, - 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, - 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, - 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, - 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, - 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, - 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, - 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, - 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, - 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, - 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, - 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, - 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, - 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, - 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, - 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, - 0x8000000080008081, 0x8000000080008081, 
0x8000000080008081, 0x8000000080008081, - 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, - 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, - 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 -]}; - -macro_rules! unroll5 { - ($var:ident, $body:block) => { - { const $var: usize = 0; $body; } - { const $var: usize = 1; $body; } - { const $var: usize = 2; $body; } - { const $var: usize = 3; $body; } - { const $var: usize = 4; $body; } - }; -} - -macro_rules! unroll24 { - ($var: ident, $body: block) => { - { const $var: usize = 0; $body; } - { const $var: usize = 1; $body; } - { const $var: usize = 2; $body; } - { const $var: usize = 3; $body; } - { const $var: usize = 4; $body; } - { const $var: usize = 5; $body; } - { const $var: usize = 6; $body; } - { const $var: usize = 7; $body; } - { const $var: usize = 8; $body; } - { const $var: usize = 9; $body; } - { const $var: usize = 10; $body; } - { const $var: usize = 11; $body; } - { const $var: usize = 12; $body; } - { const $var: usize = 13; $body; } - { const $var: usize = 14; $body; } - { const $var: usize = 15; $body; } - { const $var: usize = 16; $body; } - { const $var: usize = 17; $body; } - { const $var: usize = 18; $body; } - { const $var: usize = 19; $body; } - { const $var: usize = 20; $body; } - { const $var: usize = 21; $body; } - { const $var: usize = 22; $body; } - { const $var: usize = 23; $body; } - }; -} - -#[allow(unused_assignments, non_upper_case_globals)] -pub fn f1600_x4(a: &mut [__m256i]) { - unsafe { - - for i in 0..24 { - let mut array = [_mm256_setzero_si256(); 5]; - - // Theta - unroll5!(x, { - unroll5!(y, { - array[x] = _mm256_xor_si256(array[x], a[5 * y + x]); - }); - }); - - unroll5!(x, { - unroll5!(y, { - let t1 = array[(x + 4) % 5]; - let mut t2 = Temp {vec: array[(x + 1) % 5]}; - for i in 0..4 { - t2.u[i] = t2.u[i].rotate_left(1); - } - a[5 * y + x] = _mm256_xor_si256(a[5 * y + x], _mm256_xor_si256(t1, t2.vec)); - }); - }); - - // Rho and pi - let mut last = a[1]; - unroll24!(x, { - array[0] = a[PI[x]]; - let mut temp_last = Temp {vec: last}; - for i in 0..4 { - temp_last.u[i] = temp_last.u[i].rotate_left(RHO[x]); - } - a[PI[x]] = temp_last.vec; - last = array[0]; - }); - - // Chi - unroll5!(y_step, { - let y = 5 * y_step; - - unroll5!(x, { - array[x] = a[y + x]; - }); - - unroll5!(x, { - let t1 = array[(x + 1) % 5]; - let t2 = array[(x + 2) % 5]; - let tmp = _mm256_xor_si256(array[x], _mm256_andnot_si256(t1, t2)); - a[y+x] = tmp; - }); - }); - a[0] = _mm256_xor_si256(a[0], RC_X4.vecs[i]); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - const PLEN: usize = 25; - // Test vectors from XKCP - // https://github.com/XKCP/XKCP/blob/master/tests/TestVectors/KeccakF-1600-IntermediateValues.txt - #[test] - fn known_vectors() { - let vec1: [u64; 25] = [ - 0xF1258F7940E1DDE7, 0x84D5CCF933C0478A, 0xD598261EA65AA9EE, 0xBD1547306F80494D, - 0x8B284E056253D057, 0xFF97A42D7F8E6FD4, 0x90FEE5A0A44647C4, 0x8C5BDA0CD6192E76, - 0xAD30A6F71B19059C, 0x30935AB7D08FFC64, 0xEB5AA93F2317D635, 0xA9A6E6260D712103, - 0x81A57C16DBCF555F, 0x43B831CD0347C826, 0x01F22F1A11A5569F, 0x05E5635A21D9AE61, - 0x64BEFEF28CC970F2, 0x613670957BC46611, 0xB87C5A554FD00ECB, 0x8C3EE88A1CCF32C8, - 0x940C7922AE3A2614, 0x1841F924A2C509E4, 0x16F53526E70465C2, 0x75F644E97F30A13B, - 0xEAF1FF7B5CECA249 - ]; - - let vec2: [u64; 25] = [ - 0x2D5C954DF96ECB3C, 0x6A332CD07057B56D, 0x093D8D1270D76B6C, 0x8A20D9B25569D094, - 0x4F9C4F99E5E7F156, 0xF957B9A2DA65FB38, 
0x85773DAE1275AF0D, 0xFAF4F247C3D810F7, - 0x1F1B9EE6F79A8759, 0xE4FECC0FEE98B425, 0x68CE61B6B9CE68A1, 0xDEEA66C4BA8F974F, - 0x33C43D836EAFB1F5, 0xE00654042719DBD9, 0x7CF8A9F009831265, 0xFD5449A6BF174743, - 0x97DDAD33D8994B40, 0x48EAD5FC5D0BE774, 0xE3B8C8EE55B7B03C, 0x91A0226E649E42E9, - 0x900E3129E7BADD7B, 0x202A9EC5FAA3CCE8, 0x5B3402464E1C3DB6, 0x609F4E62A44C1059, - 0x20D06CD26A8FBF5C - ]; - - // repeat values to check all lanes - let tvec1 = expand(vec1); - let tvec2 = expand(vec2); - - unsafe { - let mut data = Data { u: [0u64;100] }; - f1600_x4(&mut data.lanes); - assert_eq!(&data.u , &tvec1); - f1600_x4(&mut data.lanes); - assert_eq!(data.u, tvec2); - } - - } - #[repr(C)] - pub union Data { - pub lanes: [__m256i; PLEN], - pub u: [u64; PLEN * 4] - } - - // [0,1...] expands to [0,0,0,0,1,1,1,1...] - fn expand(vec: [u64; PLEN]) -> [u64; 100] { - let mut out = [0u64; 100]; - for (i,u) in vec.iter().enumerate() { - out[i*4..][..4].copy_from_slice(&[*u; 4]); - } - out - } -} diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs deleted file mode 100644 index 8b9b972..0000000 --- a/src/avx2/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ - -pub mod aes256ctr; -pub mod align; -pub mod cbd; -pub mod consts; -pub mod fips202; -pub mod fips202x4; -pub mod indcpa; -pub mod keccak4x; -pub mod poly; -pub mod polyvec; -pub mod rejsample; -pub mod verify; \ No newline at end of file diff --git a/src/avx2/nasm/basemul.asm b/src/avx2/nasm/basemul.asm deleted file mode 100644 index 172bb68..0000000 --- a/src/avx2/nasm/basemul.asm +++ /dev/null @@ -1,107 +0,0 @@ -%include "consts.inc" - -%macro schoolbook 1 -vmovdqa ymm0,[rcx + _16XQINV*2] -vmovdqa ymm1,[rsi + (64*%1+ 0)*2] ; a0 -vmovdqa ymm2,[rsi + (64*%1+16)*2] ; b0 -vmovdqa ymm3,[rsi + (64*%1+32)*2] ; a1 -vmovdqa ymm4,[rsi + (64*%1+48)*2] ; b1 - -vpmullw ymm9,ymm1,ymm0 ; a0.lo -vpmullw ymm10,ymm2,ymm0 ; b0.lo -vpmullw ymm11,ymm3,ymm0 ; a1.lo -vpmullw ymm12,ymm4,ymm0 ; b1.lo - -vmovdqa ymm5,[rdx + (64*%1+ 0)*2] ; c0 -vmovdqa ymm6,[rdx + (64*%1+ 16)*2] ; d0 - -vpmulhw ymm13,ymm1,ymm5 ; a0c0.hi -vpmulhw ymm1,ymm1,ymm6 ; a0d0.hi -vpmulhw ymm14,ymm2,ymm5 ; b0c0.hi -vpmulhw ymm2,ymm2,ymm6 ; b0d0.hi - -vmovdqa ymm7,[rdx + (64*%1+ 32)*2] ; c1 -vmovdqa ymm8,[rdx + (64*%1+ 48)*2] ; d1 - -vpmulhw ymm15,ymm3,ymm7 ; a1c1.hi -vpmulhw ymm3,ymm3,ymm8 ; a1d1.hi -vpmulhw ymm0,ymm4,ymm7 ; b1c1.hi -vpmulhw ymm4,ymm4,ymm8 ; b1d1.hi - -vmovdqa [rsp],ymm13 - -vpmullw ymm13,ymm9,ymm5 ; a0c0.lo -vpmullw ymm9,ymm9,ymm6 ; a0d0.lo -vpmullw ymm5,ymm10,ymm5 ; b0c0.lo -vpmullw ymm10,ymm10,ymm6 ; b0d0.lo - -vpmullw ymm6,ymm11,ymm7 ; a1c1.lo -vpmullw ymm11,ymm11,ymm8 ; a1d1.lo -vpmullw ymm7,ymm12,ymm7 ; b1c1.lo -vpmullw ymm12,ymm12,ymm8 ; b1d1.lo - -vmovdqa ymm8,[rcx + _16XQ*2] -vpmulhw ymm13,ymm13,ymm8 -vpmulhw ymm9,ymm9,ymm8 -vpmulhw ymm5,ymm5,ymm8 -vpmulhw ymm10,ymm10,ymm8 -vpmulhw ymm6,ymm6,ymm8 -vpmulhw ymm11,ymm11,ymm8 -vpmulhw ymm7,ymm7,ymm8 -vpmulhw ymm12,ymm12,ymm8 - -vpsubw ymm13,ymm13,[rsp] ; -a0c0 -vpsubw ymm9,ymm1,ymm9 ; a0d0 -vpsubw ymm5,ymm14,ymm5 ; b0c0 -vpsubw ymm10,ymm2,ymm10 ; b0d0 - -vpsubw ymm6,ymm15,ymm6 ; a1c1 -vpsubw ymm11,ymm3,ymm11 ; a1d1 -vpsubw ymm7,ymm0,ymm7 ; b1c1 -vpsubw ymm12,ymm4,ymm12 ; b1d1 - -vmovdqa ymm0,[r9] -vmovdqa ymm1,[r9 + 32] -vpmullw ymm2,ymm10,ymm0 -vpmullw ymm3,ymm12,ymm0 -vpmulhw ymm10,ymm10,ymm1 -vpmulhw ymm12,ymm12,ymm1 -vpmulhw ymm2,ymm2,ymm8 -vpmulhw ymm3,ymm3,ymm8 -vpsubw ymm10,ymm10,ymm2 ; rb0d0 -vpsubw ymm12,ymm12,ymm3 ; rb1d1 - -vpaddw ymm9,ymm9,ymm5 -vpaddw ymm11,ymm11,ymm7 -vpsubw ymm13,ymm10,ymm13 -vpsubw ymm6,ymm6,ymm12 - -vmovdqa [rdi + 
(64*%1+ 0)*2],ymm13 -vmovdqa [rdi + (64*%1+ 16)*2],ymm9 -vmovdqa [rdi + (64*%1+ 32)*2],ymm6 -vmovdqa [rdi + (64*%1+ 48)*2],ymm11 -%endmacro - -SECTION .text -global basemul_avx -global _basemul_avx -basemul_avx: -_basemul_avx: -mov r8,rsp -and rsp,-32 -sub rsp,32 - -lea r9,[rcx + (_ZETAS_EXP+176)*2] -schoolbook 0 - -add r9,32*2 -schoolbook 1 - -add r9,192*2 -schoolbook 2 - -add r9,32*2 -schoolbook 3 - -mov rsp,r8 -ret diff --git a/src/avx2/nasm/consts.inc b/src/avx2/nasm/consts.inc deleted file mode 100644 index 524e24e..0000000 --- a/src/avx2/nasm/consts.inc +++ /dev/null @@ -1,12 +0,0 @@ -%define _16XQ 0 -%define _16XQINV 16 -%define _16XV 32 -%define _16XFLO 48 -%define _16XFHI 64 -%define _16XMONTSQLO 80 -%define _16XMONTSQHI 96 -%define _16XMASK 112 -%define _REVIDXB 128 -%define _REVIDXD 144 -%define _ZETAS_EXP 160 -%define _16XSHIFT 624 \ No newline at end of file diff --git a/src/avx2/nasm/fq.asm b/src/avx2/nasm/fq.asm deleted file mode 100644 index 535bcb2..0000000 --- a/src/avx2/nasm/fq.asm +++ /dev/null @@ -1,92 +0,0 @@ -%include "fq.inc" -%include "consts.inc" - -SECTION .text -reduce128_avx: -;load -vmovdqa ymm2,[rdi] -vmovdqa ymm3,[rdi + 32] -vmovdqa ymm4,[rdi + 64] -vmovdqa ymm5,[rdi + 96] -vmovdqa ymm6,[rdi + 128] -vmovdqa ymm7,[rdi + 160] -vmovdqa ymm8,[rdi + 192] -vmovdqa ymm9,[rdi + 224] - -red16 2 -red16 3 -red16 4 -red16 5 -red16 6 -red16 7 -red16 8 -red16 9 - -;store -vmovdqa [rdi],ymm2 -vmovdqa [rdi + 32],ymm3 -vmovdqa [rdi + 64],ymm4 -vmovdqa [rdi + 96],ymm5 -vmovdqa [rdi + 128],ymm6 -vmovdqa [rdi + 160],ymm7 -vmovdqa [rdi + 192],ymm8 -vmovdqa [rdi + 224],ymm9 - -ret - -global reduce_avx -global _reduce_avx -reduce_avx: -_reduce_avx: -;consts -vmovdqa ymm0,[rsi + _16XQ*2] -vmovdqa ymm1,[rsi + _16XV*2] -call reduce128_avx -add rdi,256 -call reduce128_avx -ret - -tomont128_avx: -;load -vmovdqa ymm3,[rdi] -vmovdqa ymm4,[rdi + 32] -vmovdqa ymm5,[rdi + 64] -vmovdqa ymm6,[rdi + 96] -vmovdqa ymm7,[rdi + 128] -vmovdqa ymm8,[rdi + 160] -vmovdqa ymm9,[rdi + 192] -vmovdqa ymm10,[rdi + 224] - -fqmulprecomp 1,2,3,11 -fqmulprecomp 1,2,4,12 -fqmulprecomp 1,2,5,13 -fqmulprecomp 1,2,6,14 -fqmulprecomp 1,2,7,15 -fqmulprecomp 1,2,8,11 -fqmulprecomp 1,2,9,12 -fqmulprecomp 1,2,10,13 - -;store -vmovdqa [rdi],ymm3 -vmovdqa [rdi + 32],ymm4 -vmovdqa [rdi + 64],ymm5 -vmovdqa [rdi + 96],ymm6 -vmovdqa [rdi + 128],ymm7 -vmovdqa [rdi + 160],ymm8 -vmovdqa [rdi + 192],ymm9 -vmovdqa [rdi + 224],ymm10 - -ret - -global tomont_avx -global _tomont_avx -tomont_avx: -_tomont_avx: -;consts -vmovdqa ymm0,[rsi + _16XQ*2] -vmovdqa ymm1,[rsi + _16XMONTSQLO*2] -vmovdqa ymm2,[rsi + _16XMONTSQHI*2] -call tomont128_avx -add rdi,256 -call tomont128_avx -ret diff --git a/src/avx2/nasm/fq.inc b/src/avx2/nasm/fq.inc deleted file mode 100644 index fc55981..0000000 --- a/src/avx2/nasm/fq.inc +++ /dev/null @@ -1,20 +0,0 @@ -%macro red16 1 -vpmulhw ymm12,ymm%1,ymm1 -vpsraw ymm12,ymm12,10 -vpmullw ymm12,ymm12,ymm0 -vpsubw ymm%1,ymm%1,ymm12 -%endmacro - -%macro csubq 2 -vpsubw ymm%1,ymm%1,ymm0 -vpsraw ymm%2,ymm%1,15 -vpand ymm%2,ymm%2,ymm0 -vpaddw ymm%1,ymm%1,ymm%2 -%endmacro - -%macro fqmulprecomp 3-4 12 -vpmullw ymm%4,ymm%3,ymm%1 -vpmulhw ymm%3,ymm%3,ymm%2 -vpmulhw ymm%4,ymm%4,ymm0 -vpsubw ymm%3,ymm%3,ymm%4 -%endmacro diff --git a/src/avx2/nasm/invntt.asm b/src/avx2/nasm/invntt.asm deleted file mode 100644 index 4a66d03..0000000 --- a/src/avx2/nasm/invntt.asm +++ /dev/null @@ -1,191 +0,0 @@ -%include "shuffle.inc" -%include "fq.inc" -%include "consts.inc" - -%macro butterfly 8-12 2,2,3,3 -vpsubw ymm12,ymm%5,ymm%1 -vpaddw 
ymm%1,ymm%1,ymm%5 -vpsubw ymm13,ymm%6,ymm%2 - -vpmullw ymm%5,ymm12,ymm%9 -vpaddw ymm%2,ymm%2,ymm%6 -vpsubw ymm14,ymm%7,ymm%3 - -vpmullw ymm%6,ymm13,ymm%9 -vpaddw ymm%3,ymm%3,ymm%7 -vpsubw ymm15,ymm%8,ymm%4 - -vpmullw ymm%7,ymm14,ymm%10 -vpaddw ymm%4,ymm%4,ymm%8 -vpmullw ymm%8,ymm15,ymm%10 - -vpmulhw ymm12,ymm12,ymm%11 -vpmulhw ymm13,ymm13,ymm%11 - -vpmulhw ymm14,ymm14,ymm%12 -vpmulhw ymm15,ymm15,ymm%12 - -vpmulhw ymm%5,ymm%5,ymm0 - -vpmulhw ymm%6,ymm%6,ymm0 - -vpmulhw ymm%7,ymm%7,ymm0 -vpmulhw ymm%8,ymm%8,ymm0 - -vpsubw ymm%5,ymm12,ymm%5 - -vpsubw ymm%6,ymm13,ymm%6 - -vpsubw ymm%7,ymm14,ymm%7 -vpsubw ymm%8,ymm15,ymm%8 -%endmacro - -%macro intt_levels0t5 1 -; level 0 -vmovdqa ymm2,[rsi + _16XFLO*2] -vmovdqa ymm3,[rsi + _16XFHI*2] - -vmovdqa ymm4,[rdi + (128*%1+ 0)*2] -vmovdqa ymm6,[rdi + (128*%1+ 32)*2] -vmovdqa ymm5,[rdi + (128*%1+ 16)*2] -vmovdqa ymm7,[rdi + (128*%1+ 48)*2] - -fqmulprecomp 2,3,4 -fqmulprecomp 2,3,6 -fqmulprecomp 2,3,5 -fqmulprecomp 2,3,7 - -vmovdqa ymm8,[rdi + (128*%1+ 64)*2] -vmovdqa ymm10,[rdi + (128*%1+ 96)*2] -vmovdqa ymm9,[rdi + (128*%1+ 80)*2] -vmovdqa ymm11,[rdi + (128*%1+ 112)*2] - -fqmulprecomp 2,3,8 -fqmulprecomp 2,3,10 -fqmulprecomp 2,3,9 -fqmulprecomp 2,3,11 - -vpermq ymm15,[rsi + (_ZETAS_EXP+(1-%1)*224+208)*2],04Eh -vpermq ymm1,[rsi + (_ZETAS_EXP+(1-%1)*224+176)*2],04Eh -vpermq ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+224)*2],04Eh -vpermq ymm3,[rsi + (_ZETAS_EXP+(1-%1)*224+192)*2],04Eh -vmovdqa ymm12,[rsi + _REVIDXB*2] -vpshufb ymm15,ymm15,ymm12 -vpshufb ymm1,ymm1,ymm12 -vpshufb ymm2,ymm2,ymm12 -vpshufb ymm3,ymm3,ymm12 - -butterfly 4,5,8,9,6,7,10,11,15,1,2,3 - -; level 1 -vpermq ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+144)*2],04Eh -vpermq ymm3,[rsi + (_ZETAS_EXP+(1-%1)*224+160)*2],04Eh -vmovdqa ymm1,[rsi + _REVIDXB*2] -vpshufb ymm2,ymm2,ymm1 -vpshufb ymm3,ymm3,ymm1 - -butterfly 4,5,6,7,8,9,10,11,2,2,3,3 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 -shuffle1 10,11,8,11 - -; level 2 -vmovdqa ymm12,[rsi + _REVIDXD*2] -vpermd ymm2,ymm12,[rsi + (_ZETAS_EXP+(1-%1)*224+112)*2] -vpermd ymm10,ymm12,[rsi + (_ZETAS_EXP+(1-%1)*224+128)*2] - -butterfly 3,4,6,8,5,7,9,11,2,2,10,10 - -vmovdqa ymm1,[rsi + _16XV*2] -red16 3 - -shuffle2 3,4,10,4 -shuffle2 6,8,3,8 -shuffle2 5,7,6,7 -shuffle2 9,11,5,11 - -; level 3 -vpermq ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+80)*2],01Bh -vpermq ymm9,[rsi + (_ZETAS_EXP+(1-%1)*224+96)*2],01Bh - -butterfly 10,3,6,5,4,8,7,11,2,2,9,9 - -shuffle4 10,3,9,3 -shuffle4 6,5,10,5 -shuffle4 4,8,6,8 -shuffle4 7,11,4,11 - -; level 4 -vpermq ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+48)*2],04Eh -vpermq ymm7,[rsi + (_ZETAS_EXP+(1-%1)*224+64)*2],04Eh - -butterfly 9,10,6,4,3,5,8,11,2,2,7,7 - -red16 9 - -shuffle8 9,10,7,10 -shuffle8 6,4,9,4 -shuffle8 3,5,6,5 -shuffle8 8,11,3,11 - -; level 5 -vmovdqa ymm2,[rsi + (_ZETAS_EXP+(1-%1)*224+16)*2] -vmovdqa ymm8,[rsi + (_ZETAS_EXP+(1-%1)*224+32)*2] - -butterfly 7,9,6,3,10,4,5,11,2,2,8,8 - -vmovdqa [rdi + (128*%1 + 0)*2],ymm7 -vmovdqa [rdi + (128*%1 + 16)*2],ymm9 -vmovdqa [rdi + (128*%1 + 32)*2],ymm6 -vmovdqa [rdi + (128*%1 + 48)*2],ymm3 -vmovdqa [rdi + (128*%1 + 64)*2],ymm10 -vmovdqa [rdi + (128*%1 + 80)*2],ymm4 -vmovdqa [rdi + (128*%1 + 96)*2],ymm5 -vmovdqa [rdi + (128*%1 + 112)*2],ymm11 -%endmacro - -%macro intt_level6 1 -; level 6 -vmovdqa ymm4,[rdi + (64*%1+ 0)*2] -vmovdqa ymm8,[rdi + (64*%1+ 128)*2] -vmovdqa ymm5,[rdi + (64*%1+ 16)*2] -vmovdqa ymm9,[rdi + (64*%1+ 144)*2] -vpbroadcastq ymm2,[rsi + (_ZETAS_EXP+0)*2] - -vmovdqa ymm6,[rdi + (64*%1+ 32)*2] -vmovdqa ymm10,[rdi + (64*%1+ 160)*2] -vmovdqa ymm7,[rdi + (64*%1+ 48)*2] 
-vmovdqa ymm11,[rdi + (64*%1+ 176)*2] -vpbroadcastq ymm3,[rsi + (_ZETAS_EXP+4)*2] - -butterfly 4,5,6,7,8,9,10,11 - -%if %1 == 0 -red16 4 -%endif - -vmovdqa [rdi + (64*%1+ 0)*2],ymm4 -vmovdqa [rdi + (64*%1+ 16)*2],ymm5 -vmovdqa [rdi + (64*%1+ 32)*2],ymm6 -vmovdqa [rdi + (64*%1+ 48)*2],ymm7 -vmovdqa [rdi + (64*%1+ 128)*2],ymm8 -vmovdqa [rdi + (64*%1+ 144)*2],ymm9 -vmovdqa [rdi + (64*%1+ 160)*2],ymm10 -vmovdqa [rdi + (64*%1+ 176)*2],ymm11 -%endmacro - -SECTION .text -global invntt_avx -global _invntt_avx -invntt_avx: -_invntt_avx: -vmovdqa ymm0,[rsi + _16XQ*2] - -intt_levels0t5 0 -intt_levels0t5 1 - -intt_level6 0 -intt_level6 1 -ret diff --git a/src/avx2/nasm/ntt.asm b/src/avx2/nasm/ntt.asm deleted file mode 100644 index c61b8bd..0000000 --- a/src/avx2/nasm/ntt.asm +++ /dev/null @@ -1,191 +0,0 @@ -%include "shuffle.inc" -%include "consts.inc" - -%macro mul 4-8 15,15,2,2 -vpmullw ymm12,ymm%1,ymm%5 -vpmullw ymm13,ymm%2,ymm%5 - -vpmullw ymm14,ymm%3,ymm%6 -vpmullw ymm15,ymm%4,ymm%6 - -vpmulhw ymm%1,ymm%1,ymm%7 -vpmulhw ymm%2,ymm%2,ymm%7 - -vpmulhw ymm%3,ymm%3,ymm%8 -vpmulhw ymm%4,ymm%4,ymm%8 -%endmacro - -%macro reduce 0 -vpmulhw ymm12,ymm12,ymm0 -vpmulhw ymm13,ymm13,ymm0 - -vpmulhw ymm14,ymm14,ymm0 -vpmulhw ymm15,ymm15,ymm0 -%endmacro - -%macro update 9 -vpaddw ymm%1,ymm%2,ymm%6 -vpsubw ymm%6,ymm%2,ymm%6 -vpaddw ymm%2,ymm%3,ymm%7 - -vpsubw ymm%7,ymm%3,ymm%7 -vpaddw ymm%3,ymm%4,ymm%8 -vpsubw ymm%8,ymm%4,ymm%8 - -vpaddw ymm%4,ymm%5,ymm%9 -vpsubw ymm%9,ymm%5,ymm%9 - -vpsubw ymm%1,ymm%1,ymm12 -vpaddw ymm%6,ymm%6,ymm12 -vpsubw ymm%2,ymm%2,ymm13 - -vpaddw ymm%7,ymm%7,ymm13 -vpsubw ymm%3,ymm%3,ymm14 -vpaddw ymm%8,ymm%8,ymm14 - -vpsubw ymm%4,ymm%4,ymm15 -vpaddw ymm%9,ymm%9,ymm15 -%endmacro - -%macro level0 1 -vpbroadcastq ymm15,[rsi+ (_ZETAS_EXP+0)*2] -vmovdqa ymm8,[rdi + (64*%1+ 128)*2] -vmovdqa ymm9,[rdi + (64*%1+ 144)*2] -vmovdqa ymm10,[rdi + (64*%1+ 160)*2] -vmovdqa ymm11,[rdi + (64*%1+ 176)*2] -vpbroadcastq ymm2,[rsi+ (_ZETAS_EXP+4)*2] - -mul 8,9,10,11 - -vmovdqa ymm4,[rdi + (64*%1+ 0)*2] -vmovdqa ymm5,[rdi + (64*%1+ 16)*2] -vmovdqa ymm6,[rdi + (64*%1+ 32)*2] -vmovdqa ymm7,[rdi + (64*%1+ 48)*2] - -reduce -update 3,4,5,6,7,8,9,10,11 - -vmovdqa [rdi + (64*%1+ 0)*2],ymm3 -vmovdqa [rdi + (64*%1+ 16)*2],ymm4 -vmovdqa [rdi + (64*%1+ 32)*2],ymm5 -vmovdqa [rdi + (64*%1+ 48)*2],ymm6 -vmovdqa [rdi + (64*%1+ 128)*2],ymm8 -vmovdqa [rdi + (64*%1+ 144)*2],ymm9 -vmovdqa [rdi + (64*%1+ 160)*2],ymm10 -vmovdqa [rdi + (64*%1+ 176)*2],ymm11 -%endmacro - -%macro levels1t6 1 -; level 1 -vmovdqa ymm15,[rsi+ (_ZETAS_EXP+224*%1+16)*2] -vmovdqa ymm8,[rdi + (128*%1+ 64)*2] -vmovdqa ymm9,[rdi + (128*%1+ 80)*2] -vmovdqa ymm10,[rdi + (128*%1+ 96)*2] -vmovdqa ymm11,[rdi + (128*%1+ 112)*2] -vmovdqa ymm2,[rsi+ (_ZETAS_EXP+224*%1+32)*2] - -mul 8,9,10,11 - -vmovdqa ymm4,[rdi + (128*%1+ 0)*2] -vmovdqa ymm5,[rdi + (128*%1+ 16)*2] -vmovdqa ymm6,[rdi + (128*%1+ 32)*2] -vmovdqa ymm7,[rdi + (128*%1+ 48)*2] - -reduce -update 3,4,5,6,7,8,9,10,11 - -; level 2 -shuffle8 5,10,7,10 -shuffle8 6,11,5,11 - -vmovdqa ymm15,[rsi+ (_ZETAS_EXP+224*%1+48)*2] -vmovdqa ymm2,[rsi+ (_ZETAS_EXP+224*%1+64)*2] - -mul 7,10,5,11 - -shuffle8 3,8,6,8 -shuffle8 4,9,3,9 - -reduce -update 4,6,8,3,9,7,10,5,11 - -; level 3 -shuffle4 8,5,9,5 -shuffle4 3,11,8,11 - -vmovdqa ymm15,[rsi+ (_ZETAS_EXP+224*%1+80)*2] -vmovdqa ymm2,[rsi+ (_ZETAS_EXP+224*%1+96)*2] - -mul 9,5,8,11 - -shuffle4 4,7,3,7 -shuffle4 6,10,4,10 - -reduce -update 6,3,7,4,10,9,5,8,11 - -; level 4 -shuffle2 7,8,10,8 -shuffle2 4,11,7,11 - -vmovdqa ymm15,[rsi+ (_ZETAS_EXP+224*%1+112)*2] -vmovdqa ymm2,[rsi+ 
(_ZETAS_EXP+224*%1+128)*2] - -mul 10,8,7,11 - -shuffle2 6,9,4,9 -shuffle2 3,5,6,5 - -reduce -update 3,4,9,6,5,10,8,7,11 - -; level 5 -shuffle1 9,7,5,7 -shuffle1 6,11,9,11 - -vmovdqa ymm15,[rsi+ (_ZETAS_EXP+224*%1+144)*2] -vmovdqa ymm2,[rsi+ (_ZETAS_EXP+224*%1+160)*2] - -mul 5,7,9,11 - -shuffle1 3,10,6,10 -shuffle1 4,8,3,8 - -reduce -update 4,6,10,3,8,5,7,9,11 - -; level 6 -vmovdqa ymm14,[rsi+ (_ZETAS_EXP+224*%1+176)*2] -vmovdqa ymm15,[rsi+ (_ZETAS_EXP+224*%1+208)*2] -vmovdqa ymm8,[rsi+ (_ZETAS_EXP+224*%1+192)*2] -vmovdqa ymm2,[rsi+ (_ZETAS_EXP+224*%1+224)*2] - -mul 10,3,9,11,14,15,8,2 - -reduce -update 8,4,6,5,7,10,3,9,11 - -vmovdqa [rdi + (128*%1+ 0)*2],ymm8 -vmovdqa [rdi + (128*%1+ 16)*2],ymm4 -vmovdqa [rdi + (128*%1+ 32)*2],ymm10 -vmovdqa [rdi + (128*%1+ 48)*2],ymm3 -vmovdqa [rdi + (128*%1+ 64)*2],ymm6 -vmovdqa [rdi + (128*%1+ 80)*2],ymm5 -vmovdqa [rdi + (128*%1+ 96)*2],ymm9 -vmovdqa [rdi + (128*%1+ 112)*2],ymm11 -%endmacro - -SECTION .text -global ntt_avx -global _ntt_avx -ntt_avx: -_ntt_avx: -vmovdqa ymm0,[rsi + _16XQ*2] - -level0 0 -level0 1 - -levels1t6 0 -levels1t6 1 - -ret diff --git a/src/avx2/nasm/shuffle.asm b/src/avx2/nasm/shuffle.asm deleted file mode 100644 index d055a27..0000000 --- a/src/avx2/nasm/shuffle.asm +++ /dev/null @@ -1,216 +0,0 @@ -%include "fq.inc" -%include "shuffle.inc" -%include "consts.inc" - -SECTION .text -nttunpack128_avx: -;load -vmovdqa ymm4,[rdi] -vmovdqa ymm5,[rdi + 32] -vmovdqa ymm6,[rdi + 64] -vmovdqa ymm7,[rdi + 96] -vmovdqa ymm8,[rdi + 128] -vmovdqa ymm9,[rdi + 160] -vmovdqa ymm10,[rdi + 192] -vmovdqa ymm11,[rdi + 224] - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -;store -vmovdqa [rdi],ymm10 -vmovdqa [rdi + 32],ymm5 -vmovdqa [rdi + 64],ymm9 -vmovdqa [rdi + 96],ymm4 -vmovdqa [rdi + 128],ymm8 -vmovdqa [rdi + 160],ymm3 -vmovdqa [rdi + 192],ymm7 -vmovdqa [rdi + 224],ymm11 - -ret - -global nttunpack_avx -global _nttunpack_avx -nttunpack_avx: -_nttunpack_avx: -call nttunpack128_avx -add rdi,256 -call nttunpack128_avx -ret - -ntttobytes128_avx: -;load -vmovdqa ymm5,[rsi] -vmovdqa ymm6,[rsi + 32] -vmovdqa ymm7,[rsi + 64] -vmovdqa ymm8,[rsi + 96] -vmovdqa ymm9,[rsi + 128] -vmovdqa ymm10,[rsi + 160] -vmovdqa ymm11,[rsi + 192] -vmovdqa ymm12,[rsi + 224] - -;csubq -csubq 5,13 -csubq 6,13 -csubq 7,13 -csubq 8,13 -csubq 9,13 -csubq 10,13 -csubq 11,13 -csubq 12,13 - -;bitpack -vpsllw ymm4,ymm6,12 -vpor ymm4,ymm5,ymm4 - -vpsrlw ymm5,ymm6,4 -vpsllw ymm6,ymm7,8 -vpor ymm5,ymm6,ymm5 - -vpsrlw ymm6,ymm7,8 -vpsllw ymm7,ymm8,4 -vpor ymm6,ymm7,ymm6 - -vpsllw ymm7,ymm10,12 -vpor ymm7,ymm9,ymm7 - -vpsrlw ymm8,ymm10,4 -vpsllw ymm9,ymm11,8 -vpor ymm8,ymm9,ymm8 - -vpsrlw ymm9,ymm11,8 -vpsllw ymm10,ymm12,4 -vpor ymm9,ymm10,ymm9 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 - -shuffle2 3,4,8,4 -shuffle2 6,5,3,5 -shuffle2 7,9,6,9 - -shuffle4 8,3,7,3 -shuffle4 6,4,8,4 -shuffle4 5,9,6,9 - -shuffle8 7,8,5,8 -shuffle8 6,3,7,3 -shuffle8 4,9,6,9 - -;store -vmovdqu [rdi],ymm5 -vmovdqu [rdi + 32],ymm7 -vmovdqu [rdi + 64],ymm6 -vmovdqu [rdi + 96],ymm8 -vmovdqu [rdi + 128],ymm3 -vmovdqu [rdi + 160],ymm9 - -ret - -global ntttobytes_avx -global _ntttobytes_avx -ntttobytes_avx: -_ntttobytes_avx: -;consts -vmovdqa ymm0,[rdx + _16XQ*2] -call ntttobytes128_avx -add rsi,256 -add rdi,192 -call ntttobytes128_avx -ret - 
-nttfrombytes128_avx: -;load -vmovdqu ymm4,[rsi] -vmovdqu ymm5,[rsi + 32] -vmovdqu ymm6,[rsi + 64] -vmovdqu ymm7,[rsi + 96] -vmovdqu ymm8,[rsi + 128] -vmovdqu ymm9,[rsi + 160] - -shuffle8 4,7,3,7 -shuffle8 5,8,4,8 -shuffle8 6,9,5,9 - -shuffle4 3,8,6,8 -shuffle4 7,5,3,5 -shuffle4 4,9,7,9 - -shuffle2 6,5,4,5 -shuffle2 8,7,6,7 -shuffle2 3,9,8,9 - -shuffle1 4,7,10,7 -shuffle1 5,8,4,8 -shuffle1 6,9,5,9 - -;bitunpack -vpsrlw ymm11,ymm10,12 -vpsllw ymm12,ymm7,4 -vpor ymm11,ymm12,ymm11 -vpand ymm10,ymm10,ymm0 -vpand ymm11,ymm11,ymm0 - -vpsrlw ymm12,ymm7,8 -vpsllw ymm13,ymm4,8 -vpor ymm12,ymm13,ymm12 -vpand ymm12,ymm12,ymm0 - -vpsrlw ymm13,ymm4,4 -vpand ymm13,ymm13,ymm0 - -vpsrlw ymm14,ymm8,12 -vpsllw ymm15,ymm5,4 -vpor ymm14,ymm15,ymm14 -vpand ymm8,ymm8,ymm0 -vpand ymm14,ymm14,ymm0 - -vpsrlw ymm15,ymm5,8 -vpsllw ymm1,ymm9,8 -vpor ymm15,ymm1,ymm15 -vpand ymm15,ymm15,ymm0 - -vpsrlw ymm1,ymm9,4 -vpand ymm1,ymm1,ymm0 - -;store -vmovdqa [rdi],ymm10 -vmovdqa [rdi + 32],ymm11 -vmovdqa [rdi + 64],ymm12 -vmovdqa [rdi + 96],ymm13 -vmovdqa [rdi + 128],ymm8 -vmovdqa [rdi + 160],ymm14 -vmovdqa [rdi + 192],ymm15 -vmovdqa [rdi + 224],ymm1 - -ret - -global nttfrombytes_avx -global _nttfrombytes_avx -nttfrombytes_avx: -_nttfrombytes_avx: -;consts -vmovdqa ymm0,[rdx + _16XMASK*2] -call nttfrombytes128_avx -add rdi,256 -add rsi,192 -call nttfrombytes128_avx -ret diff --git a/src/avx2/nasm/shuffle.inc b/src/avx2/nasm/shuffle.inc deleted file mode 100644 index ecbca80..0000000 --- a/src/avx2/nasm/shuffle.inc +++ /dev/null @@ -1,25 +0,0 @@ -%macro shuffle8 4 -vperm2i128 ymm%3,ymm%1,ymm%2,020h -vperm2i128 ymm%4,ymm%1,ymm%2,031h -%endmacro - -%macro shuffle4 4 -vpunpcklqdq ymm%3,ymm%1,ymm%2 -vpunpckhqdq ymm%4,ymm%1,ymm%2 -%endmacro - -%macro shuffle2 4 -;vpsllq ymm%3,ymm%2,32 -vmovsldup ymm%3,ymm%2 -vpblendd ymm%3,ymm%1,ymm%3,0AAh -vpsrlq ymm%1,ymm%1,32 -;vmovshdup ymm%1,ymm%1 -vpblendd ymm%4,ymm%1,ymm%2,0AAh -%endmacro - -%macro shuffle1 4 -vpslld ymm%3,ymm%2,16 -vpblendw ymm%3,ymm%1,ymm%3,0AAh -vpsrld ymm%1,ymm%1,16 -vpblendw ymm%4,ymm%1,ymm%2,0AAh -%endmacro diff --git a/src/avx2/ntt.S b/src/avx2/ntt.S deleted file mode 100644 index d402331..0000000 --- a/src/avx2/ntt.S +++ /dev/null @@ -1,191 +0,0 @@ -#include "consts.h" -.include "shuffle.inc" - -.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2 -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 - -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 - -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 - -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 -.endm - -.macro reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 - -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -.endm - -.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 -vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln -vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0 - -vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1 -vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2 - -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2 -vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3 - -vpsubw %ymm12,%ymm\rln,%ymm\rln -vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm13,%ymm\rl0,%ymm\rl0 - -vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm14,%ymm\rl1,%ymm\rl1 -vpaddw %ymm14,%ymm\rh2,%ymm\rh2 - -vpsubw %ymm15,%ymm\rl2,%ymm\rl2 -vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -.endm - -.macro level0 off -vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15 -vmovdqa (64*\off+128)*2(%rdi),%ymm8 -vmovdqa (64*\off+144)*2(%rdi),%ymm9 -vmovdqa (64*\off+160)*2(%rdi),%ymm10 -vmovdqa 
(64*\off+176)*2(%rdi),%ymm11 -vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2 - -mul 8,9,10,11 - -vmovdqa (64*\off+ 0)*2(%rdi),%ymm4 -vmovdqa (64*\off+ 16)*2(%rdi),%ymm5 -vmovdqa (64*\off+ 32)*2(%rdi),%ymm6 -vmovdqa (64*\off+ 48)*2(%rdi),%ymm7 - -reduce -update 3,4,5,6,7,8,9,10,11 - -vmovdqa %ymm3,(64*\off+ 0)*2(%rdi) -vmovdqa %ymm4,(64*\off+ 16)*2(%rdi) -vmovdqa %ymm5,(64*\off+ 32)*2(%rdi) -vmovdqa %ymm6,(64*\off+ 48)*2(%rdi) -vmovdqa %ymm8,(64*\off+128)*2(%rdi) -vmovdqa %ymm9,(64*\off+144)*2(%rdi) -vmovdqa %ymm10,(64*\off+160)*2(%rdi) -vmovdqa %ymm11,(64*\off+176)*2(%rdi) -.endm - -.macro levels1t6 off -/* level 1 */ -vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15 -vmovdqa (128*\off+ 64)*2(%rdi),%ymm8 -vmovdqa (128*\off+ 80)*2(%rdi),%ymm9 -vmovdqa (128*\off+ 96)*2(%rdi),%ymm10 -vmovdqa (128*\off+112)*2(%rdi),%ymm11 -vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2 - -mul 8,9,10,11 - -vmovdqa (128*\off+ 0)*2(%rdi),%ymm4 -vmovdqa (128*\off+ 16)*2(%rdi),%ymm5 -vmovdqa (128*\off+ 32)*2(%rdi),%ymm6 -vmovdqa (128*\off+ 48)*2(%rdi),%ymm7 - -reduce -update 3,4,5,6,7,8,9,10,11 - -/* level 2 */ -shuffle8 5,10,7,10 -shuffle8 6,11,5,11 - -vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15 -vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2 - -mul 7,10,5,11 - -shuffle8 3,8,6,8 -shuffle8 4,9,3,9 - -reduce -update 4,6,8,3,9,7,10,5,11 - -/* level 3 */ -shuffle4 8,5,9,5 -shuffle4 3,11,8,11 - -vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15 -vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2 - -mul 9,5,8,11 - -shuffle4 4,7,3,7 -shuffle4 6,10,4,10 - -reduce -update 6,3,7,4,10,9,5,8,11 - -/* level 4 */ -shuffle2 7,8,10,8 -shuffle2 4,11,7,11 - -vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15 -vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2 - -mul 10,8,7,11 - -shuffle2 6,9,4,9 -shuffle2 3,5,6,5 - -reduce -update 3,4,9,6,5,10,8,7,11 - -/* level 5 */ -shuffle1 9,7,5,7 -shuffle1 6,11,9,11 - -vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15 -vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2 - -mul 5,7,9,11 - -shuffle1 3,10,6,10 -shuffle1 4,8,3,8 - -reduce -update 4,6,10,3,8,5,7,9,11 - -/* level 6 */ -vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14 -vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15 -vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8 -vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2 - -mul 10,3,9,11,14,15,8,2 - -reduce -update 8,4,6,5,7,10,3,9,11 - -vmovdqa %ymm8,(128*\off+ 0)*2(%rdi) -vmovdqa %ymm4,(128*\off+ 16)*2(%rdi) -vmovdqa %ymm10,(128*\off+ 32)*2(%rdi) -vmovdqa %ymm3,(128*\off+ 48)*2(%rdi) -vmovdqa %ymm6,(128*\off+ 64)*2(%rdi) -vmovdqa %ymm5,(128*\off+ 80)*2(%rdi) -vmovdqa %ymm9,(128*\off+ 96)*2(%rdi) -vmovdqa %ymm11,(128*\off+112)*2(%rdi) -.endm - -.text -.global ntt_avx -.global _ntt_avx -ntt_avx: -_ntt_avx: -vmovdqa _16XQ*2(%rsi),%ymm0 - -level0 0 -level0 1 - -levels1t6 0 -levels1t6 1 - -ret diff --git a/src/avx2/poly.rs b/src/avx2/poly.rs deleted file mode 100644 index 7f90702..0000000 --- a/src/avx2/poly.rs +++ /dev/null @@ -1,400 +0,0 @@ -#![allow(unused_imports)] -use core::arch::x86_64::*; -use crate::{ - align::*, - cbd::*, - consts::*, - fips202::*, - fips202x4::*, - params::*, - symmetric::*, -}; - -pub(crate) const NOISE_NBLOCKS: usize = - (KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE; - -#[derive(Clone)] -#[repr(C)] -pub union Poly { - pub coeffs: [i16; KYBER_N], - pub vec: [__m256i; (KYBER_N+15)/16] -} - -impl Copy for Poly {} - -impl Poly { - pub fn new() -> Self { - Poly { - coeffs: [0i16; KYBER_N] - } - } -} - -extern { - fn ntt_avx(r: &mut [i16; KYBER_N], q_data: &[i16; 640]); - fn 
invntt_avx(r: &mut [i16; KYBER_N], q_data: &[i16; 640]); - fn nttunpack_avx(r: &mut [i16; KYBER_N], q_data: &[i16; 640]); - fn basemul_avx( - r: &mut[i16; KYBER_N], - a: &[i16; KYBER_N], - b: &[i16; KYBER_N], - q_data: &[i16; 640] - ); - fn tomont_avx(r: &mut [i16; KYBER_N], q_data: &[i16; 640]); - fn reduce_avx(r: &mut [i16; KYBER_N], q_data: &[i16; 640]); - fn ntttobytes_avx(r: *mut u8 , a: &[i16; KYBER_N] , q_data: &[i16; 640]); - fn nttfrombytes_avx(r: *mut i16, a: *const u8, q_data: &[i16; 640]); -} - -#[cfg(any(feature="kyber512", not(feature="kyber1024")))] -pub unsafe fn poly_compress(r: &mut[u8], a: Poly) -{ - let (mut f0, mut f1, mut f2, mut f3); - let v: __m256i = _mm256_load_si256(QDATA.vec[_16XV/16..].as_ptr()); - let shift1: __m256i = _mm256_set1_epi16(1 << 9); - let mask: __m256i = _mm256_set1_epi16(15); - let shift2: __m256i = _mm256_set1_epi16((16 << 8) + 1); - let permdidx: __m256i = _mm256_set_epi32(7,3,6,2,5,1,4,0); - - for i in 0..KYBER_N/64 { - f0 = _mm256_load_si256(&a.vec[4*i+0]); - f1 = _mm256_load_si256(&a.vec[4*i+1]); - f2 = _mm256_load_si256(&a.vec[4*i+2]); - f3 = _mm256_load_si256(&a.vec[4*i+3]); - f0 = _mm256_mulhi_epi16(f0,v); - f1 = _mm256_mulhi_epi16(f1,v); - f2 = _mm256_mulhi_epi16(f2,v); - f3 = _mm256_mulhi_epi16(f3,v); - f0 = _mm256_mulhrs_epi16(f0,shift1); - f1 = _mm256_mulhrs_epi16(f1,shift1); - f2 = _mm256_mulhrs_epi16(f2,shift1); - f3 = _mm256_mulhrs_epi16(f3,shift1); - f0 = _mm256_and_si256(f0,mask); - f1 = _mm256_and_si256(f1,mask); - f2 = _mm256_and_si256(f2,mask); - f3 = _mm256_and_si256(f3,mask); - f0 = _mm256_packus_epi16(f0,f1); - f2 = _mm256_packus_epi16(f2,f3); - f0 = _mm256_maddubs_epi16(f0,shift2); - f2 = _mm256_maddubs_epi16(f2,shift2); - f0 = _mm256_packus_epi16(f0,f2); - f0 = _mm256_permutevar8x32_epi32(f0,permdidx); - _mm256_storeu_si256(r[32*i..].as_mut_ptr() as *mut __m256i,f0); - } -} -#[cfg(any(feature="kyber512", not(feature="kyber1024")))] -pub unsafe fn poly_decompress(r: &mut Poly, a: &[u8]) -{ - let (mut t, mut f); - let q: __m256i = _mm256_load_si256(QDATA.vec[_16XQ/16..].as_ptr()); - let shufbidx: __m256i = _mm256_set_epi8( - 7,7,7,7,6,6,6,6,5,5,5,5,4,4,4,4, - 3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0 - ); - let mask: __m256i = _mm256_set1_epi32(0x00F0000F); - let shift: __m256i = _mm256_set1_epi32((128 << 16) + 2048); - - for i in 0..KYBER_N/16 { - t = _mm_loadl_epi64(a[8*i..].as_ptr() as *const __m128i); - f = _mm256_broadcastsi128_si256(t); - f = _mm256_shuffle_epi8(f,shufbidx); - f = _mm256_and_si256(f,mask); - f = _mm256_mullo_epi16(f,shift); - f = _mm256_mulhrs_epi16(f,q); - _mm256_store_si256(&mut r.vec[i],f); - } -} - -#[cfg(feature="kyber1024")] -pub unsafe fn poly_compress(r: &mut[u8], a: Poly) -{ - let (mut f0, mut f1); - let (mut t0, mut t1); - let mut tmp; - let v: __m256i = _mm256_load_si256(&QDATA.vec[_16XV/16]); - let shift1: __m256i = _mm256_set1_epi16(1 << 10); - let mask: __m256i = _mm256_set1_epi16(31); - let shift2: __m256i = _mm256_set1_epi16((32 << 8) + 1); - let shift3: __m256i = _mm256_set1_epi32((1024 << 16) + 1); - let sllvdidx: __m256i = _mm256_set1_epi64x(12); - let shufbidx: __m256i = _mm256_set_epi8( - 8,-1,-1,-1,-1,-1, 4, 3, 2, 1, 0,-1,12,11,10, 9, - -1,12,11,10, 9, 8,-1,-1,-1,-1,-1 ,4, 3, 2, 1, 0 - ); - - for i in 0..(KYBER_N/32) { - f0 = _mm256_load_si256(&a.vec[2*i+0]); - f1 = _mm256_load_si256(&a.vec[2*i+1]); - f0 = _mm256_mulhi_epi16(f0,v); - f1 = _mm256_mulhi_epi16(f1,v); - f0 = _mm256_mulhrs_epi16(f0,shift1); - f1 = _mm256_mulhrs_epi16(f1,shift1); - f0 = _mm256_and_si256(f0,mask); - f1 = 
_mm256_and_si256(f1,mask); - f0 = _mm256_packus_epi16(f0,f1); - f0 = _mm256_maddubs_epi16(f0,shift2); - f0 = _mm256_madd_epi16(f0,shift3); - f0 = _mm256_sllv_epi32(f0,sllvdidx); - f0 = _mm256_srlv_epi64(f0,sllvdidx); - f0 = _mm256_shuffle_epi8(f0,shufbidx); - t0 = _mm256_castsi256_si128(f0); - t1 = _mm256_extracti128_si256(f0,1); - t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx)); - _mm_storeu_si128(r[20*i+ 0..].as_mut_ptr() as *mut __m128i,t0); - tmp = _mm_cvtsi128_si32(t1); - r[20*i+16..20*i+20].copy_from_slice(&tmp.to_le_bytes()); - } -} - -#[cfg(feature="kyber1024")] -pub unsafe fn poly_decompress(r: &mut Poly, a: &[u8]) -{ - let (mut t, mut f, mut ti); - - let q = _mm256_load_si256(&QDATA.vec[_16XQ/16]); - let shufbidx = _mm256_set_epi8( - 9,9,9,8,8,8,8,7,7,6,6,6,6,5,5,5, - 4,4,4,3,3,3,3,2,2,1,1,1,1,0,0,0 - ); - let mask = _mm256_set_epi16( - 248,1984,62,496,3968,124,992,31, - 248,1984,62,496,3968,124,992,31 - ); - let shift = _mm256_set_epi16( - 128,16,512,64,8,256,32,1024, - 128,16,512,64,8,256,32,1024 - ); - - for i in 0..KYBER_N/16 { - t = _mm_loadl_epi64(a[10*i+0..].as_ptr() as *const __m128i); - ti = i32::from_le_bytes([a[10*i+8], a[10*i+9], 0, 0]); - t = _mm_insert_epi16(t, ti, 4); - f = _mm256_broadcastsi128_si256(t); - f = _mm256_shuffle_epi8(f,shufbidx); - f = _mm256_and_si256(f,mask); - f = _mm256_mullo_epi16(f,shift); - f = _mm256_mulhrs_epi16(f,q); - _mm256_store_si256(r.vec[i..].as_mut_ptr() as *mut __m256i,f); - } -} - -pub fn poly_frombytes(r: &mut Poly, a: &[u8]) -{ - unsafe { - nttfrombytes_avx(r.coeffs.as_mut_ptr(), a.as_ptr(), &QDATA.coeffs); - } -} - -pub fn poly_tobytes(r: &mut[u8], a: Poly) -{ - let mut buf = [0u8; KYBER_POLYBYTES]; - unsafe { ntttobytes_avx(buf.as_mut_ptr(), &a.coeffs, &QDATA.coeffs); } - r[..KYBER_POLYBYTES].copy_from_slice(&buf[..]); -} - -pub unsafe fn poly_frommsg(r: &mut Poly, msg: &[u8]) -{ - let shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0,1,2,3)); - let idx = _mm256_broadcastsi128_si256( - _mm_set_epi8(15,14,11,10,7,6,3,2,13,12,9,8,5,4,1,0) - ); - let hqs: __m256i = _mm256_set1_epi16((KYBER_Q+1) as i16/2); - let f = _mm256_loadu_si256(msg.as_ptr() as *const __m256i); - - let mut frommsg64 = |i: usize, mut g3: __m256i| { - g3 = _mm256_sllv_epi32(g3,shift); - g3 = _mm256_shuffle_epi8(g3,idx); - let mut g0 = _mm256_slli_epi16(g3,12); - let mut g1 = _mm256_slli_epi16(g3,8); - let mut g2 = _mm256_slli_epi16(g3,4); - g0 = _mm256_srai_epi16(g0,15); - g1 = _mm256_srai_epi16(g1,15); - g2 = _mm256_srai_epi16(g2,15); - g3 = _mm256_srai_epi16(g3,15); - g0 = _mm256_and_si256(g0,hqs); // 19 18 17 16 3 2 1 0 - g1 = _mm256_and_si256(g1,hqs); // 23 22 21 20 7 6 5 4 - g2 = _mm256_and_si256(g2,hqs); // 27 26 25 24 11 10 9 8 - g3 = _mm256_and_si256(g3,hqs); // 31 30 29 28 15 14 13 12 - let h0 = _mm256_unpacklo_epi64(g0,g1); - let h2 = _mm256_unpackhi_epi64(g0,g1); - let h1 = _mm256_unpacklo_epi64(g2,g3); - let h3 = _mm256_unpackhi_epi64(g2,g3); - g0 = _mm256_permute2x128_si256(h0,h1,0x20); - g2 = _mm256_permute2x128_si256(h0,h1,0x31); - g1 = _mm256_permute2x128_si256(h2,h3,0x20); - g3 = _mm256_permute2x128_si256(h2,h3,0x31); - - _mm256_store_si256(&mut r.vec[0+2*i+0],g0); - _mm256_store_si256(&mut r.vec[0+2*i+1],g1); - _mm256_store_si256(&mut r.vec[8+2*i+0],g2); - _mm256_store_si256(&mut r.vec[8+2*i+1],g3); - }; - - frommsg64(0, _mm256_shuffle_epi32(f, 0)); - frommsg64(1, _mm256_shuffle_epi32(f, 85)); - frommsg64(2, _mm256_shuffle_epi32(f, 170)); - frommsg64(3, _mm256_shuffle_epi32(f, 255)); -} - -pub fn poly_tomsg(msg: &mut[u8], a: 
Poly) -{ - unsafe { - let (mut f0, mut f1, mut g0, mut g1); - let hq: __m256i = _mm256_set1_epi16((KYBER_Q - 1) as i16/2); - let hhq: __m256i = _mm256_set1_epi16((KYBER_Q - 1) as i16/4); - - for i in 0..KYBER_N/32 { - f0 = _mm256_load_si256(&a.vec[2*i+0]); - f1 = _mm256_load_si256(&a.vec[2*i+1]); - f0 = _mm256_sub_epi16(hq, f0); - f1 = _mm256_sub_epi16(hq, f1); - g0 = _mm256_srai_epi16(f0, 15); - g1 = _mm256_srai_epi16(f1, 15); - f0 = _mm256_xor_si256(f0, g0); - f1 = _mm256_xor_si256(f1, g1); - f0 = _mm256_sub_epi16(f0, hhq); - f1 = _mm256_sub_epi16(f1, hhq); - f0 = _mm256_packs_epi16(f0, f1); - f0 = _mm256_permute4x64_epi64(f0, 0xD8); - let small = _mm256_movemask_epi8(f0); - msg[4*i..][..4].copy_from_slice(&small.to_ne_bytes()); - } - } -} - -#[cfg(all(any(feature="kyber1024", feature="kyber512"), not(feature="90s")))] -pub fn poly_getnoise_eta2(r: &mut Poly, seed: &[u8], nonce: u8) -{ - let mut buf = Eta2Buf::new(); - unsafe { - prf(&mut buf.coeffs, KYBER_ETA2*KYBER_N/4, seed, nonce); - poly_cbd_eta2(r, &buf.vec); - } -} - -#[cfg(not(feature="90s"))] -pub fn poly_getnoise_eta1_4x( - r0: &mut Poly, r1: &mut Poly, r2: &mut Poly, r3: &mut Poly, seed: &[u8], - nonce0: u8, nonce1: u8, nonce2: u8, nonce3: u8 -) -{ - unsafe { - let mut buf = [Eta4xBuf::new(); 4]; - let mut state = Keccakx4State::new(); - let f = _mm256_loadu_si256(seed.as_ptr() as *const __m256i); - _mm256_store_si256(buf[0].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[1].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[2].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[3].vec.as_mut_ptr(), f); - - buf[0].coeffs[32] = nonce0; - buf[1].coeffs[32] = nonce1; - buf[2].coeffs[32] = nonce2; - buf[3].coeffs[32] = nonce3; - - shake256x4_absorb_once( - &mut state, - &buf[0].coeffs, &buf[1].coeffs, - &buf[2].coeffs, &buf[3].coeffs, - 33 - ); - shake256x4_squeezeblocks(&mut buf, NOISE_NBLOCKS, &mut state); - - poly_cbd_eta1(r0, &buf[0]); - poly_cbd_eta1(r1, &buf[1]); - poly_cbd_eta1(r2, &buf[2]); - poly_cbd_eta1(r3, &buf[3]); - } -} - -#[cfg(all(feature="kyber512", not(feature="90s")))] -pub fn poly_getnoise_eta1122_4x( - r0: &mut Poly, r1: &mut Poly, r2: &mut Poly, r3: &mut Poly, seed: &[u8], - nonce0: u8, nonce1: u8, nonce2: u8, nonce3: u8, -) -{ - let mut buf = [Eta4xBuf::new(); 4]; - let mut state = Keccakx4State::new(); - unsafe { - let f = _mm256_loadu_si256(seed.as_ptr() as *const __m256i); - _mm256_store_si256(buf[0].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[1].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[2].vec.as_mut_ptr(), f); - _mm256_store_si256(buf[3].vec.as_mut_ptr(), f); - - buf[0].coeffs[32] = nonce0; - buf[1].coeffs[32] = nonce1; - buf[2].coeffs[32] = nonce2; - buf[3].coeffs[32] = nonce3; - - shake256x4_absorb_once( - &mut state, - &buf[0].coeffs, &buf[1].coeffs, - &buf[2].coeffs, &buf[3].coeffs, - 33 - ); - shake256x4_squeezeblocks(&mut buf, NOISE_NBLOCKS, &mut state); - - poly_cbd_eta1(r0, &buf[0]); - poly_cbd_eta1(r1, &buf[1]); - poly_cbd_eta2(r2, &buf[2].vec); - poly_cbd_eta2(r3, &buf[3].vec); - } -} - -pub fn poly_ntt(r: &mut Poly) -{ - unsafe { ntt_avx(&mut r.coeffs, &QDATA.coeffs); } -} - -pub fn poly_invntt_tomont(r: &mut Poly) -{ - unsafe { invntt_avx(&mut r.coeffs, &QDATA.coeffs); } -} - -pub fn poly_nttunpack(r: &mut Poly) -{ - unsafe { nttunpack_avx(&mut r.coeffs, &QDATA.coeffs); } -} - -pub fn poly_basemul(r: &mut Poly, a: &Poly, b: &Poly) -{ - unsafe { basemul_avx(&mut r.coeffs, &a.coeffs, &b.coeffs, &QDATA.coeffs); } -} - -pub fn poly_tomont(r: &mut Poly) -{ - unsafe { tomont_avx(&mut r.coeffs, 
&QDATA.coeffs); } -} - -pub fn poly_reduce(r: &mut Poly) -{ - unsafe { reduce_avx(&mut r.coeffs, &QDATA.coeffs); } -} - -pub fn poly_add(r: &mut Poly, b: &Poly) -{ - let (mut f0, mut f1); - for i in 0..(KYBER_N/16) { - unsafe { - f0 = _mm256_load_si256(&r.vec[i]); - f1 = _mm256_load_si256(&b.vec[i]); - f0 = _mm256_add_epi16(f0, f1); - _mm256_store_si256(&mut r.vec[i] , f0); - } - } -} - -pub fn poly_sub(r: &mut Poly, a: &Poly) -{ - let (mut f0, mut f1); - for i in 0..(KYBER_N/16) { - unsafe { - f0 = _mm256_load_si256(&a.vec[i]); - f1 = _mm256_load_si256(&r.vec[i]); - f0 = _mm256_sub_epi16(f0, f1); - _mm256_store_si256(&mut r.vec[i], f0); - } - } -} - diff --git a/src/avx2/polyvec.rs b/src/avx2/polyvec.rs deleted file mode 100644 index f85c50f..0000000 --- a/src/avx2/polyvec.rs +++ /dev/null @@ -1,277 +0,0 @@ -use core::arch::x86_64::*; -use crate::{ - poly::*, - params::*, - consts::* -}; - -#[derive(Clone)] -pub struct Polyvec { - pub vec: [Poly; KYBER_K] -} - -impl Copy for Polyvec {} - -impl Polyvec { - pub fn new() -> Self { - Polyvec { - vec: [Poly::new(); KYBER_K] - } - } -} - -// #[target_feature(enable = "avx")] -pub unsafe fn poly_compress10(r: &mut[u8], a: &Poly) -{ - let (mut f0, mut f1, mut f2); - let (mut t0, mut t1); - - let v = _mm256_load_si256(QDATA.vec[_16XV/16..].as_ptr()); - let v8 = _mm256_slli_epi16(v,3); - let off = _mm256_set1_epi16(15); - let shift1 = _mm256_set1_epi16(1 << 12); - let mask = _mm256_set1_epi16(1023); - let shift2 = _mm256_set1_epi64x( - ((1024u64 << 48) + (1u64 << 32) + (1024 << 16) + 1) as i64 - ); - let sllvdidx = _mm256_set1_epi64x(12); - let shufbidx = _mm256_set_epi8( - 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9, - -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0 - ); - - for i in 0..(KYBER_N/16) { - f0 = _mm256_load_si256(&a.vec[i]); - f1 = _mm256_mullo_epi16(f0,v8); - f2 = _mm256_add_epi16(f0,off); - f0 = _mm256_slli_epi16(f0,3); - f0 = _mm256_mulhi_epi16(f0,v); - f2 = _mm256_sub_epi16(f1,f2); - f1 = _mm256_andnot_si256(f1,f2); - f1 = _mm256_srli_epi16(f1,15); - f0 = _mm256_sub_epi16(f0,f1); - f0 = _mm256_mulhrs_epi16(f0,shift1); - f0 = _mm256_and_si256(f0,mask); - f0 = _mm256_madd_epi16(f0,shift2); - f0 = _mm256_sllv_epi32(f0,sllvdidx); - f0 = _mm256_srli_epi64(f0,12); - f0 = _mm256_shuffle_epi8(f0,shufbidx); - t0 = _mm256_castsi256_si128(f0); - t1 = _mm256_extracti128_si256(f0,1); - t0 = _mm_blend_epi16(t0,t1,0xE0); - _mm_storeu_si128(r[20*i..].as_mut_ptr() as *mut __m128i,t0); - _mm_storeu_si128(r[20*i+16..].as_mut_ptr() as *mut __m128i, t1); - } -} - -// #[target_feature(enable = "avx")] -pub unsafe fn poly_decompress10(r: &mut Poly, a: &[u8]) -{ - let mut f; - let q = _mm256_set1_epi32(((KYBER_Q as i32) << 16) + 4*KYBER_Q as i32); - let shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7, - 6, 5, 5, 4, 4, 3, 3, 2, - 9, 8, 8, 7, 7, 6, 6, 5, - 4, 3, 3, 2, 2, 1, 1, 0); - let sllvdidx = _mm256_set1_epi64x(4); - let mask = _mm256_set1_epi32((32736 << 16) + 8184); - for i in 0..KYBER_N/16 { - f = _mm256_loadu_si256(a[20*i..].as_ptr() as *const __m256i); - f = _mm256_permute4x64_epi64(f,0x94); - f = _mm256_shuffle_epi8(f,shufbidx); - f = _mm256_sllv_epi32(f,sllvdidx); - f = _mm256_srli_epi16(f,1); - f = _mm256_and_si256(f,mask); - f = _mm256_mulhrs_epi16(f,q); - _mm256_store_si256(&mut r.vec[i],f); - } -} - -// #[target_feature(enable = "avx")] -pub unsafe fn poly_compress11(r: &mut[u8], a: &Poly) -{ - let (mut f0, mut f1, mut f2); - let (mut t0, mut t1); - let v = _mm256_load_si256(QDATA.vec[_16XV/16..].as_ptr()); - let v8 = 
_mm256_slli_epi16(v,3); - let off = _mm256_set1_epi16(36); - let shift1 = _mm256_set1_epi16(1 << 13); - let mask = _mm256_set1_epi16(2047); - let shift2 = _mm256_set1_epi64x( - ((2048u64 << 48) + (1u64 << 32) + (2048 << 16) + 1) as i64 - ); - let sllvdidx = _mm256_set1_epi64x(10); - let srlvqidx = _mm256_set_epi64x(30,10,30,10); - let shufbidx = _mm256_set_epi8( - 4, 3, 2, 1, 0, 0,-1,-1,-1,-1,10, 9, 8, 7, 6, 5, - -1,-1,-1,-1,-1,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - ); - - for i in 0..KYBER_N/16 { - f0 = _mm256_load_si256(&a.vec[i]); - f1 = _mm256_mullo_epi16(f0,v8); - f2 = _mm256_add_epi16(f0,off); - f0 = _mm256_slli_epi16(f0,3); - f0 = _mm256_mulhi_epi16(f0,v); - f2 = _mm256_sub_epi16(f1,f2); - f1 = _mm256_andnot_si256(f1,f2); - f1 = _mm256_srli_epi16(f1,15); - f0 = _mm256_sub_epi16(f0,f1); - f0 = _mm256_mulhrs_epi16(f0,shift1); - f0 = _mm256_and_si256(f0,mask); - f0 = _mm256_madd_epi16(f0,shift2); - f0 = _mm256_sllv_epi32(f0,sllvdidx); - f1 = _mm256_bsrli_epi128(f0,8); - f0 = _mm256_srlv_epi64(f0,srlvqidx); - f1 = _mm256_slli_epi64(f1,34); - f0 = _mm256_add_epi64(f0,f1); - f0 = _mm256_shuffle_epi8(f0,shufbidx); - t0 = _mm256_castsi256_si128(f0); - t1 = _mm256_extracti128_si256(f0,1); - t0 = _mm_blendv_epi8(t0,t1,_mm256_castsi256_si128(shufbidx)); - _mm_storeu_si128(r[22*i+ 0..].as_mut_ptr() as *mut __m128i,t0); - _mm_storel_epi64(r[22*i+16..].as_mut_ptr() as *mut __m128i,t1); - } -} - -// #[target_feature(enable = "avx")] -pub unsafe fn poly_decompress11(r: &mut Poly, a: &[u8]) -{ - let mut f; - - let q = _mm256_load_si256(QDATA.vec[_16XQ/16..].as_ptr()); - let shufbidx = _mm256_set_epi8( - 13,12,12,11,10, 9, 9, 8, - 8, 7, 6, 5, 5, 4, 4, 3, - 10, 9, 9, 8, 7, 6, 6, 5, - 5, 4, 3, 2, 2, 1, 1, 0 - ); - let srlvdidx = _mm256_set_epi32(0,0,1,0,0,0,1,0); - let srlvqidx = _mm256_set_epi64x(2,0,2,0); - let shift = _mm256_set_epi16(4,32,1,8,32,1,4,32,4,32,1,8,32,1,4,32); - let mask = _mm256_set1_epi16(32752); - - for i in 0..(KYBER_N/16) { - f = _mm256_loadu_si256(a[22*i..].as_ptr() as *const __m256i); - f = _mm256_permute4x64_epi64(f,0x94); - f = _mm256_shuffle_epi8(f,shufbidx); - f = _mm256_srlv_epi32(f,srlvdidx); - f = _mm256_srlv_epi64(f,srlvqidx); - f = _mm256_mullo_epi16(f,shift); - f = _mm256_srli_epi16(f,1); - f = _mm256_and_si256(f,mask); - f = _mm256_mulhrs_epi16(f,q); - _mm256_store_si256(&mut r.vec[i],f); - } -} - - -pub unsafe fn polyvec_compress(r: &mut[u8], a: &Polyvec) -{ - if cfg!(feature="kyber1024") { - for i in 0..KYBER_K { - poly_compress11(&mut r[352*i..], &a.vec[i]); - } - } else { - for i in 0..KYBER_K { - poly_compress10(&mut r[320*i..], &a.vec[i]); - } - } -} - -pub unsafe fn polyvec_decompress(r: &mut Polyvec, a: &[u8]) -{ - if cfg!(feature="kyber1024") { - for i in 0..KYBER_K { - poly_decompress11(&mut r.vec[i], &a[352*i..]); - } - } else { - for i in 0..KYBER_K { - poly_decompress10(&mut r.vec[i], &a[320*i..]); - } - } -} - -pub fn polyvec_tobytes(r: &mut[u8], a: &Polyvec) -{ - for i in 0..KYBER_K { - poly_tobytes(&mut r[i*KYBER_POLYBYTES..], a.vec[i]); - } -} - -pub unsafe fn polyvec_frombytes(r: &mut Polyvec, a: &[u8]) -{ - for i in 0..KYBER_K { - poly_frombytes(&mut r.vec[i], &a[i*KYBER_POLYBYTES..]); - } -} - -// Name: polyvec_ntt -// -// Description: Apply forward NTT to all elements of a vector of polynomials -// -// Arguments: - Polyvec r: in/output vector of polynomials -pub fn polyvec_ntt(r: &mut Polyvec) -{ - for i in 0..KYBER_K { - poly_ntt(&mut r.vec[i]); - } -} - -// Name: polyvec_invntt -// -// Description: Apply inverse NTT to all elements of a vector of 
polynomials -// -// Arguments: - Polyvec r: in/output vector of polynomials -pub fn polyvec_invntt_tomont(r: &mut Polyvec) -{ - for i in 0..KYBER_K { - poly_invntt_tomont(&mut r.vec[i]); - } -} - -// Name: polyvec_basemul_acc_montgomery -// -// Description: Pointwise multiply elements of a and b and accumulate into r -// -// Arguments: - poly *r: output polynomial -// - const Polyvec a: first input vector of polynomials -// - const Polyvec b: second input vector of polynomials -pub fn polyvec_basemul_acc_montgomery(r: &mut Poly, a: &Polyvec, b: &Polyvec) -{ - let mut t = Poly::new(); - poly_basemul(r, &a.vec[0], &b.vec[0]); - for i in 1..KYBER_K { - poly_basemul(&mut t, &a.vec[i], &b.vec[i]); - poly_add(r, &t); - } -} - -// Name: polyvec_reduce -// -// Description: Applies Barrett reduction to each coefficient -// of each element of a vector of polynomials -// for details of the Barrett reduction see comments in reduce.c -// -// Arguments: - poly *r: input/output polynomial -pub fn polyvec_reduce(r: &mut Polyvec) -{ - for i in 0..KYBER_K { - poly_reduce(&mut r.vec[i]); - } -} - - -// Name: polyvec_add -// -// Description: Add vectors of polynomials -// -// Arguments: - Polyvec r: output vector of polynomials -// - const Polyvec a: first input vector of polynomials -// - const Polyvec b: second input vector of polynomials -pub fn polyvec_add(r: &mut Polyvec, b: &Polyvec) -{ - for i in 0..KYBER_K { - poly_add(&mut r.vec[i], &b.vec[i]); - } -} diff --git a/src/avx2/rejsample.rs b/src/avx2/rejsample.rs deleted file mode 100644 index d37bea8..0000000 --- a/src/avx2/rejsample.rs +++ /dev/null @@ -1,380 +0,0 @@ -use core::arch::x86_64::*; -use crate::{ - params::*, - consts::*, - symmetric::* -}; - -pub(crate) const REJ_UNIFORM_AVX_NBLOCKS: usize = - (12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES; -const REJ_UNIFORM_AVX_BUFLEN: usize = REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES; - -pub unsafe fn _mm256_cmpge_epu16(a: __m256i, b: __m256i) -> __m256i { - _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) -} - -pub unsafe fn _mm_cmpge_epu16(a: __m128i, b: __m128i) -> __m128i { - _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -} - -pub unsafe fn rej_uniform_avx(r: &mut[i16], buf: &[u8]) -> usize { - let mut ctr = 0; - let mut pos = 0; - let mut good: usize; - let (mut val0, mut val1); - let (mut f0, mut f1, mut g0, mut g1, mut g2, mut g3); - let (mut f, mut t, mut pilo, mut pihi); - let qdata_ptr = QDATA.coeffs[_16XQ..].as_ptr(); - let bound = _mm256_load_si256(qdata_ptr as *const __m256i); - let ones = _mm256_set1_epi8(1); - let mask = _mm256_set1_epi16(0xFFF); - let idx8 = _mm256_set_epi8( - 15,14,14,13,12,11,11,10, - 9, 8, 8, 7, 6, 5, 5, 4, - 11,10,10, 9, 8, 7, 7, 6, - 5, 4, 4, 3, 2, 1, 1, 0 - ); - while ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48 { - f0 = _mm256_loadu_si256(buf[pos..].as_ptr() as *const __m256i); - f1 = _mm256_loadu_si256(buf[pos+24..].as_ptr() as *const __m256i); - f0 = _mm256_permute4x64_epi64(f0, 0x94); - f1 = _mm256_permute4x64_epi64(f1, 0x94); - f0 = _mm256_shuffle_epi8(f0, idx8); - f1 = _mm256_shuffle_epi8(f1, idx8); - g0 = _mm256_srli_epi16(f0, 4); - g1 = _mm256_srli_epi16(f1, 4); - f0 = _mm256_blend_epi16(f0, g0, 0xAA); - f1 = _mm256_blend_epi16(f1, g1, 0xAA); - f0 = _mm256_and_si256(f0, mask); - f1 = _mm256_and_si256(f1, mask); - pos += 48; - - g0 = _mm256_cmpgt_epi16(bound, f0); - g1 = _mm256_cmpgt_epi16(bound, f1); - - g0 = _mm256_packs_epi16(g0, g1); - good = _mm256_movemask_epi8(g0) as usize; - - let mut l0 = _mm_loadl_epi64(IDX[(good >> 0) & 
0xFF].as_ptr() as * const __m128i); - g0 = _mm256_castsi128_si256(l0); - let mut l1 = _mm_loadl_epi64(IDX[(good >> 8) & 0xFF].as_ptr() as *const __m128i); - g1 = _mm256_castsi128_si256(l1); - - l0 = _mm_loadl_epi64(IDX[(good >> 16) & 0xFF].as_ptr() as *const __m128i); - g0 = _mm256_inserti128_si256(g0, l0, 1); - l1 = _mm_loadl_epi64(IDX[(good >> 24) & 0xFF].as_ptr() as *const __m128i); - g1 = _mm256_inserti128_si256(g1, l1, 1); - - g2 = _mm256_add_epi8(g0, ones); - g3 = _mm256_add_epi8(g1, ones); - g0 = _mm256_unpacklo_epi8(g0, g2); - g1 = _mm256_unpacklo_epi8(g1, g3); - - f0 = _mm256_shuffle_epi8(f0, g0); - f1 = _mm256_shuffle_epi8(f1, g1); - - _mm_storeu_si128(r[ctr..].as_mut_ptr() as *mut __m128i, _mm256_castsi256_si128(f0)); - ctr += _popcnt32(((good >> 0) & 0xFF) as i32) as usize; - _mm_storeu_si128(r[ctr..].as_mut_ptr() as *mut __m128i, _mm256_extracti128_si256(f0, 1)); - ctr += _popcnt32(((good >> 16) & 0xFF) as i32) as usize; - _mm_storeu_si128(r[ctr..].as_mut_ptr() as *mut __m128i, _mm256_castsi256_si128(f1)); - ctr += _popcnt32(((good >> 8) & 0xFF) as i32) as usize; - _mm_storeu_si128(r[ctr..].as_mut_ptr() as *mut __m128i, _mm256_extracti128_si256(f1, 1)); - ctr += _popcnt32(((good >> 24) & 0xFF) as i32) as usize; - } - - while ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12 { - f = _mm_loadu_si128(buf[pos..].as_ptr() as *const __m128i); - f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8)); - t = _mm_srli_epi16(f, 4); - f = _mm_blend_epi16(f, t, 0xAA); - f = _mm_and_si128(f, _mm256_castsi256_si128(mask)); - pos += 12; - - t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f); - good = _mm_movemask_epi8(t) as usize; - - let good = _pext_u32(good as u32, 0x5555) as usize; - pilo = _mm_loadl_epi64(IDX[good][..].as_ptr() as *const __m128i); - pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); - pilo = _mm_unpacklo_epi8(pilo, pihi); - f = _mm_shuffle_epi8(f, pilo); - _mm_storeu_si128(r[ctr..].as_mut_ptr() as *mut __m128i, f); - ctr += _popcnt32(good as i32) as usize; - } - - while ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3 { - val0 = (buf[pos+0] >> 0) as u16 | ((buf[pos+1] as u16) << 8) & 0xFFF; - val1 = (buf[pos+1] >> 4) as u16 | ((buf[pos+2] as u16) << 4); - pos += 3; - - if (val0 as usize) < KYBER_Q { - r[ctr] = val0 as i16; - ctr += 1; - } - if (val1 as usize) < KYBER_Q && ctr < KYBER_N { - r[ctr] = val1 as i16; - ctr += 1; - } - } - ctr -} - -const IDX: [[i8; 8]; 256] = [ - [-1, -1, -1, -1, -1, -1, -1, -1], - [ 0, -1, -1, -1, -1, -1, -1, -1], - [ 2, -1, -1, -1, -1, -1, -1, -1], - [ 0, 2, -1, -1, -1, -1, -1, -1], - [ 4, -1, -1, -1, -1, -1, -1, -1], - [ 0, 4, -1, -1, -1, -1, -1, -1], - [ 2, 4, -1, -1, -1, -1, -1, -1], - [ 0, 2, 4, -1, -1, -1, -1, -1], - [ 6, -1, -1, -1, -1, -1, -1, -1], - [ 0, 6, -1, -1, -1, -1, -1, -1], - [ 2, 6, -1, -1, -1, -1, -1, -1], - [ 0, 2, 6, -1, -1, -1, -1, -1], - [ 4, 6, -1, -1, -1, -1, -1, -1], - [ 0, 4, 6, -1, -1, -1, -1, -1], - [ 2, 4, 6, -1, -1, -1, -1, -1], - [ 0, 2, 4, 6, -1, -1, -1, -1], - [ 8, -1, -1, -1, -1, -1, -1, -1], - [ 0, 8, -1, -1, -1, -1, -1, -1], - [ 2, 8, -1, -1, -1, -1, -1, -1], - [ 0, 2, 8, -1, -1, -1, -1, -1], - [ 4, 8, -1, -1, -1, -1, -1, -1], - [ 0, 4, 8, -1, -1, -1, -1, -1], - [ 2, 4, 8, -1, -1, -1, -1, -1], - [ 0, 2, 4, 8, -1, -1, -1, -1], - [ 6, 8, -1, -1, -1, -1, -1, -1], - [ 0, 6, 8, -1, -1, -1, -1, -1], - [ 2, 6, 8, -1, -1, -1, -1, -1], - [ 0, 2, 6, 8, -1, -1, -1, -1], - [ 4, 6, 8, -1, -1, -1, -1, -1], - [ 0, 4, 6, 8, -1, -1, -1, -1], - [ 2, 4, 6, 8, -1, -1, -1, -1], - [ 0, 2, 4, 6, 8, -1, -1, -1], 
- [10, -1, -1, -1, -1, -1, -1, -1], - [ 0, 10, -1, -1, -1, -1, -1, -1], - [ 2, 10, -1, -1, -1, -1, -1, -1], - [ 0, 2, 10, -1, -1, -1, -1, -1], - [ 4, 10, -1, -1, -1, -1, -1, -1], - [ 0, 4, 10, -1, -1, -1, -1, -1], - [ 2, 4, 10, -1, -1, -1, -1, -1], - [ 0, 2, 4, 10, -1, -1, -1, -1], - [ 6, 10, -1, -1, -1, -1, -1, -1], - [ 0, 6, 10, -1, -1, -1, -1, -1], - [ 2, 6, 10, -1, -1, -1, -1, -1], - [ 0, 2, 6, 10, -1, -1, -1, -1], - [ 4, 6, 10, -1, -1, -1, -1, -1], - [ 0, 4, 6, 10, -1, -1, -1, -1], - [ 2, 4, 6, 10, -1, -1, -1, -1], - [ 0, 2, 4, 6, 10, -1, -1, -1], - [ 8, 10, -1, -1, -1, -1, -1, -1], - [ 0, 8, 10, -1, -1, -1, -1, -1], - [ 2, 8, 10, -1, -1, -1, -1, -1], - [ 0, 2, 8, 10, -1, -1, -1, -1], - [ 4, 8, 10, -1, -1, -1, -1, -1], - [ 0, 4, 8, 10, -1, -1, -1, -1], - [ 2, 4, 8, 10, -1, -1, -1, -1], - [ 0, 2, 4, 8, 10, -1, -1, -1], - [ 6, 8, 10, -1, -1, -1, -1, -1], - [ 0, 6, 8, 10, -1, -1, -1, -1], - [ 2, 6, 8, 10, -1, -1, -1, -1], - [ 0, 2, 6, 8, 10, -1, -1, -1], - [ 4, 6, 8, 10, -1, -1, -1, -1], - [ 0, 4, 6, 8, 10, -1, -1, -1], - [ 2, 4, 6, 8, 10, -1, -1, -1], - [ 0, 2, 4, 6, 8, 10, -1, -1], - [12, -1, -1, -1, -1, -1, -1, -1], - [ 0, 12, -1, -1, -1, -1, -1, -1], - [ 2, 12, -1, -1, -1, -1, -1, -1], - [ 0, 2, 12, -1, -1, -1, -1, -1], - [ 4, 12, -1, -1, -1, -1, -1, -1], - [ 0, 4, 12, -1, -1, -1, -1, -1], - [ 2, 4, 12, -1, -1, -1, -1, -1], - [ 0, 2, 4, 12, -1, -1, -1, -1], - [ 6, 12, -1, -1, -1, -1, -1, -1], - [ 0, 6, 12, -1, -1, -1, -1, -1], - [ 2, 6, 12, -1, -1, -1, -1, -1], - [ 0, 2, 6, 12, -1, -1, -1, -1], - [ 4, 6, 12, -1, -1, -1, -1, -1], - [ 0, 4, 6, 12, -1, -1, -1, -1], - [ 2, 4, 6, 12, -1, -1, -1, -1], - [ 0, 2, 4, 6, 12, -1, -1, -1], - [ 8, 12, -1, -1, -1, -1, -1, -1], - [ 0, 8, 12, -1, -1, -1, -1, -1], - [ 2, 8, 12, -1, -1, -1, -1, -1], - [ 0, 2, 8, 12, -1, -1, -1, -1], - [ 4, 8, 12, -1, -1, -1, -1, -1], - [ 0, 4, 8, 12, -1, -1, -1, -1], - [ 2, 4, 8, 12, -1, -1, -1, -1], - [ 0, 2, 4, 8, 12, -1, -1, -1], - [ 6, 8, 12, -1, -1, -1, -1, -1], - [ 0, 6, 8, 12, -1, -1, -1, -1], - [ 2, 6, 8, 12, -1, -1, -1, -1], - [ 0, 2, 6, 8, 12, -1, -1, -1], - [ 4, 6, 8, 12, -1, -1, -1, -1], - [ 0, 4, 6, 8, 12, -1, -1, -1], - [ 2, 4, 6, 8, 12, -1, -1, -1], - [ 0, 2, 4, 6, 8, 12, -1, -1], - [10, 12, -1, -1, -1, -1, -1, -1], - [ 0, 10, 12, -1, -1, -1, -1, -1], - [ 2, 10, 12, -1, -1, -1, -1, -1], - [ 0, 2, 10, 12, -1, -1, -1, -1], - [ 4, 10, 12, -1, -1, -1, -1, -1], - [ 0, 4, 10, 12, -1, -1, -1, -1], - [ 2, 4, 10, 12, -1, -1, -1, -1], - [ 0, 2, 4, 10, 12, -1, -1, -1], - [ 6, 10, 12, -1, -1, -1, -1, -1], - [ 0, 6, 10, 12, -1, -1, -1, -1], - [ 2, 6, 10, 12, -1, -1, -1, -1], - [ 0, 2, 6, 10, 12, -1, -1, -1], - [ 4, 6, 10, 12, -1, -1, -1, -1], - [ 0, 4, 6, 10, 12, -1, -1, -1], - [ 2, 4, 6, 10, 12, -1, -1, -1], - [ 0, 2, 4, 6, 10, 12, -1, -1], - [ 8, 10, 12, -1, -1, -1, -1, -1], - [ 0, 8, 10, 12, -1, -1, -1, -1], - [ 2, 8, 10, 12, -1, -1, -1, -1], - [ 0, 2, 8, 10, 12, -1, -1, -1], - [ 4, 8, 10, 12, -1, -1, -1, -1], - [ 0, 4, 8, 10, 12, -1, -1, -1], - [ 2, 4, 8, 10, 12, -1, -1, -1], - [ 0, 2, 4, 8, 10, 12, -1, -1], - [ 6, 8, 10, 12, -1, -1, -1, -1], - [ 0, 6, 8, 10, 12, -1, -1, -1], - [ 2, 6, 8, 10, 12, -1, -1, -1], - [ 0, 2, 6, 8, 10, 12, -1, -1], - [ 4, 6, 8, 10, 12, -1, -1, -1], - [ 0, 4, 6, 8, 10, 12, -1, -1], - [ 2, 4, 6, 8, 10, 12, -1, -1], - [ 0, 2, 4, 6, 8, 10, 12, -1], - [14, -1, -1, -1, -1, -1, -1, -1], - [ 0, 14, -1, -1, -1, -1, -1, -1], - [ 2, 14, -1, -1, -1, -1, -1, -1], - [ 0, 2, 14, -1, -1, -1, -1, -1], - [ 4, 14, -1, -1, -1, -1, -1, -1], - [ 0, 4, 14, -1, -1, -1, -1, -1], - [ 2, 4, 14, -1, -1, -1, -1, 
-1], - [ 0, 2, 4, 14, -1, -1, -1, -1], - [ 6, 14, -1, -1, -1, -1, -1, -1], - [ 0, 6, 14, -1, -1, -1, -1, -1], - [ 2, 6, 14, -1, -1, -1, -1, -1], - [ 0, 2, 6, 14, -1, -1, -1, -1], - [ 4, 6, 14, -1, -1, -1, -1, -1], - [ 0, 4, 6, 14, -1, -1, -1, -1], - [ 2, 4, 6, 14, -1, -1, -1, -1], - [ 0, 2, 4, 6, 14, -1, -1, -1], - [ 8, 14, -1, -1, -1, -1, -1, -1], - [ 0, 8, 14, -1, -1, -1, -1, -1], - [ 2, 8, 14, -1, -1, -1, -1, -1], - [ 0, 2, 8, 14, -1, -1, -1, -1], - [ 4, 8, 14, -1, -1, -1, -1, -1], - [ 0, 4, 8, 14, -1, -1, -1, -1], - [ 2, 4, 8, 14, -1, -1, -1, -1], - [ 0, 2, 4, 8, 14, -1, -1, -1], - [ 6, 8, 14, -1, -1, -1, -1, -1], - [ 0, 6, 8, 14, -1, -1, -1, -1], - [ 2, 6, 8, 14, -1, -1, -1, -1], - [ 0, 2, 6, 8, 14, -1, -1, -1], - [ 4, 6, 8, 14, -1, -1, -1, -1], - [ 0, 4, 6, 8, 14, -1, -1, -1], - [ 2, 4, 6, 8, 14, -1, -1, -1], - [ 0, 2, 4, 6, 8, 14, -1, -1], - [10, 14, -1, -1, -1, -1, -1, -1], - [ 0, 10, 14, -1, -1, -1, -1, -1], - [ 2, 10, 14, -1, -1, -1, -1, -1], - [ 0, 2, 10, 14, -1, -1, -1, -1], - [ 4, 10, 14, -1, -1, -1, -1, -1], - [ 0, 4, 10, 14, -1, -1, -1, -1], - [ 2, 4, 10, 14, -1, -1, -1, -1], - [ 0, 2, 4, 10, 14, -1, -1, -1], - [ 6, 10, 14, -1, -1, -1, -1, -1], - [ 0, 6, 10, 14, -1, -1, -1, -1], - [ 2, 6, 10, 14, -1, -1, -1, -1], - [ 0, 2, 6, 10, 14, -1, -1, -1], - [ 4, 6, 10, 14, -1, -1, -1, -1], - [ 0, 4, 6, 10, 14, -1, -1, -1], - [ 2, 4, 6, 10, 14, -1, -1, -1], - [ 0, 2, 4, 6, 10, 14, -1, -1], - [ 8, 10, 14, -1, -1, -1, -1, -1], - [ 0, 8, 10, 14, -1, -1, -1, -1], - [ 2, 8, 10, 14, -1, -1, -1, -1], - [ 0, 2, 8, 10, 14, -1, -1, -1], - [ 4, 8, 10, 14, -1, -1, -1, -1], - [ 0, 4, 8, 10, 14, -1, -1, -1], - [ 2, 4, 8, 10, 14, -1, -1, -1], - [ 0, 2, 4, 8, 10, 14, -1, -1], - [ 6, 8, 10, 14, -1, -1, -1, -1], - [ 0, 6, 8, 10, 14, -1, -1, -1], - [ 2, 6, 8, 10, 14, -1, -1, -1], - [ 0, 2, 6, 8, 10, 14, -1, -1], - [ 4, 6, 8, 10, 14, -1, -1, -1], - [ 0, 4, 6, 8, 10, 14, -1, -1], - [ 2, 4, 6, 8, 10, 14, -1, -1], - [ 0, 2, 4, 6, 8, 10, 14, -1], - [12, 14, -1, -1, -1, -1, -1, -1], - [ 0, 12, 14, -1, -1, -1, -1, -1], - [ 2, 12, 14, -1, -1, -1, -1, -1], - [ 0, 2, 12, 14, -1, -1, -1, -1], - [ 4, 12, 14, -1, -1, -1, -1, -1], - [ 0, 4, 12, 14, -1, -1, -1, -1], - [ 2, 4, 12, 14, -1, -1, -1, -1], - [ 0, 2, 4, 12, 14, -1, -1, -1], - [ 6, 12, 14, -1, -1, -1, -1, -1], - [ 0, 6, 12, 14, -1, -1, -1, -1], - [ 2, 6, 12, 14, -1, -1, -1, -1], - [ 0, 2, 6, 12, 14, -1, -1, -1], - [ 4, 6, 12, 14, -1, -1, -1, -1], - [ 0, 4, 6, 12, 14, -1, -1, -1], - [ 2, 4, 6, 12, 14, -1, -1, -1], - [ 0, 2, 4, 6, 12, 14, -1, -1], - [ 8, 12, 14, -1, -1, -1, -1, -1], - [ 0, 8, 12, 14, -1, -1, -1, -1], - [ 2, 8, 12, 14, -1, -1, -1, -1], - [ 0, 2, 8, 12, 14, -1, -1, -1], - [ 4, 8, 12, 14, -1, -1, -1, -1], - [ 0, 4, 8, 12, 14, -1, -1, -1], - [ 2, 4, 8, 12, 14, -1, -1, -1], - [ 0, 2, 4, 8, 12, 14, -1, -1], - [ 6, 8, 12, 14, -1, -1, -1, -1], - [ 0, 6, 8, 12, 14, -1, -1, -1], - [ 2, 6, 8, 12, 14, -1, -1, -1], - [ 0, 2, 6, 8, 12, 14, -1, -1], - [ 4, 6, 8, 12, 14, -1, -1, -1], - [ 0, 4, 6, 8, 12, 14, -1, -1], - [ 2, 4, 6, 8, 12, 14, -1, -1], - [ 0, 2, 4, 6, 8, 12, 14, -1], - [10, 12, 14, -1, -1, -1, -1, -1], - [ 0, 10, 12, 14, -1, -1, -1, -1], - [ 2, 10, 12, 14, -1, -1, -1, -1], - [ 0, 2, 10, 12, 14, -1, -1, -1], - [ 4, 10, 12, 14, -1, -1, -1, -1], - [ 0, 4, 10, 12, 14, -1, -1, -1], - [ 2, 4, 10, 12, 14, -1, -1, -1], - [ 0, 2, 4, 10, 12, 14, -1, -1], - [ 6, 10, 12, 14, -1, -1, -1, -1], - [ 0, 6, 10, 12, 14, -1, -1, -1], - [ 2, 6, 10, 12, 14, -1, -1, -1], - [ 0, 2, 6, 10, 12, 14, -1, -1], - [ 4, 6, 10, 12, 14, -1, -1, -1], - [ 0, 4, 6, 10, 12, 14, -1, 
-1], - [ 2, 4, 6, 10, 12, 14, -1, -1], - [ 0, 2, 4, 6, 10, 12, 14, -1], - [ 8, 10, 12, 14, -1, -1, -1, -1], - [ 0, 8, 10, 12, 14, -1, -1, -1], - [ 2, 8, 10, 12, 14, -1, -1, -1], - [ 0, 2, 8, 10, 12, 14, -1, -1], - [ 4, 8, 10, 12, 14, -1, -1, -1], - [ 0, 4, 8, 10, 12, 14, -1, -1], - [ 2, 4, 8, 10, 12, 14, -1, -1], - [ 0, 2, 4, 8, 10, 12, 14, -1], - [ 6, 8, 10, 12, 14, -1, -1, -1], - [ 0, 6, 8, 10, 12, 14, -1, -1], - [ 2, 6, 8, 10, 12, 14, -1, -1], - [ 0, 2, 6, 8, 10, 12, 14, -1], - [ 4, 6, 8, 10, 12, 14, -1, -1], - [ 0, 4, 6, 8, 10, 12, 14, -1], - [ 2, 4, 6, 8, 10, 12, 14, -1], - [ 0, 2, 4, 6, 8, 10, 12, 14] -]; \ No newline at end of file diff --git a/src/avx2/shuffle.S b/src/avx2/shuffle.S deleted file mode 100644 index 5dc56df..0000000 --- a/src/avx2/shuffle.S +++ /dev/null @@ -1,261 +0,0 @@ -#include "consts.h" -.include "fq.inc" -.include "shuffle.inc" - -/* -nttpack_avx: -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 -shuffle1 10,11,8,11 - -shuffle2 3,4,10,4 -shuffle2 6,8,3,8 -shuffle2 5,7,6,7 -shuffle2 9,11,5,11 - -shuffle4 10,3,9,3 -shuffle4 6,5,10,5 -shuffle4 4,8,6,8 -shuffle4 7,11,4,11 - -shuffle8 9,10,7,10 -shuffle8 6,4,9,4 -shuffle8 3,5,6,5 -shuffle8 8,11,3,11 - -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret -*/ - -.text -nttunpack128_avx: -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global nttunpack_avx -.global _nttunpack_avx -nttunpack_avx: -_nttunpack_avx: -call nttunpack128_avx -add $256,%rdi -call nttunpack128_avx -ret - -ntttobytes128_avx: -#load -vmovdqa (%rsi),%ymm5 -vmovdqa 32(%rsi),%ymm6 -vmovdqa 64(%rsi),%ymm7 -vmovdqa 96(%rsi),%ymm8 -vmovdqa 128(%rsi),%ymm9 -vmovdqa 160(%rsi),%ymm10 -vmovdqa 192(%rsi),%ymm11 -vmovdqa 224(%rsi),%ymm12 - -#csubq -csubq 5,13 -csubq 6,13 -csubq 7,13 -csubq 8,13 -csubq 9,13 -csubq 10,13 -csubq 11,13 -csubq 12,13 - -#bitpack -vpsllw $12,%ymm6,%ymm4 -vpor %ymm4,%ymm5,%ymm4 - -vpsrlw $4,%ymm6,%ymm5 -vpsllw $8,%ymm7,%ymm6 -vpor %ymm5,%ymm6,%ymm5 - -vpsrlw $8,%ymm7,%ymm6 -vpsllw $4,%ymm8,%ymm7 -vpor %ymm6,%ymm7,%ymm6 - -vpsllw $12,%ymm10,%ymm7 -vpor %ymm7,%ymm9,%ymm7 - -vpsrlw $4,%ymm10,%ymm8 -vpsllw $8,%ymm11,%ymm9 -vpor %ymm8,%ymm9,%ymm8 - -vpsrlw $8,%ymm11,%ymm9 -vpsllw $4,%ymm12,%ymm10 -vpor %ymm9,%ymm10,%ymm9 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 - -shuffle2 3,4,8,4 -shuffle2 6,5,3,5 -shuffle2 7,9,6,9 - -shuffle4 8,3,7,3 -shuffle4 6,4,8,4 -shuffle4 5,9,6,9 - -shuffle8 7,8,5,8 -shuffle8 6,3,7,3 -shuffle8 4,9,6,9 - -#store -vmovdqu %ymm5,(%rdi) -vmovdqu %ymm7,32(%rdi) -vmovdqu 
%ymm6,64(%rdi) -vmovdqu %ymm8,96(%rdi) -vmovdqu %ymm3,128(%rdi) -vmovdqu %ymm9,160(%rdi) - -ret - -.global ntttobytes_avx -.global _ntttobytes_avx -ntttobytes_avx: -_ntttobytes_avx: -#consts -vmovdqa _16XQ*2(%rdx),%ymm0 -call ntttobytes128_avx -add $256,%rsi -add $192,%rdi -call ntttobytes128_avx -ret - -nttfrombytes128_avx: -#load -vmovdqu (%rsi),%ymm4 -vmovdqu 32(%rsi),%ymm5 -vmovdqu 64(%rsi),%ymm6 -vmovdqu 96(%rsi),%ymm7 -vmovdqu 128(%rsi),%ymm8 -vmovdqu 160(%rsi),%ymm9 - -shuffle8 4,7,3,7 -shuffle8 5,8,4,8 -shuffle8 6,9,5,9 - -shuffle4 3,8,6,8 -shuffle4 7,5,3,5 -shuffle4 4,9,7,9 - -shuffle2 6,5,4,5 -shuffle2 8,7,6,7 -shuffle2 3,9,8,9 - -shuffle1 4,7,10,7 -shuffle1 5,8,4,8 -shuffle1 6,9,5,9 - -#bitunpack -vpsrlw $12,%ymm10,%ymm11 -vpsllw $4,%ymm7,%ymm12 -vpor %ymm11,%ymm12,%ymm11 -vpand %ymm0,%ymm10,%ymm10 -vpand %ymm0,%ymm11,%ymm11 - -vpsrlw $8,%ymm7,%ymm12 -vpsllw $8,%ymm4,%ymm13 -vpor %ymm12,%ymm13,%ymm12 -vpand %ymm0,%ymm12,%ymm12 - -vpsrlw $4,%ymm4,%ymm13 -vpand %ymm0,%ymm13,%ymm13 - -vpsrlw $12,%ymm8,%ymm14 -vpsllw $4,%ymm5,%ymm15 -vpor %ymm14,%ymm15,%ymm14 -vpand %ymm0,%ymm8,%ymm8 -vpand %ymm0,%ymm14,%ymm14 - -vpsrlw $8,%ymm5,%ymm15 -vpsllw $8,%ymm9,%ymm1 -vpor %ymm15,%ymm1,%ymm15 -vpand %ymm0,%ymm15,%ymm15 - -vpsrlw $4,%ymm9,%ymm1 -vpand %ymm0,%ymm1,%ymm1 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm11,32(%rdi) -vmovdqa %ymm12,64(%rdi) -vmovdqa %ymm13,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm14,160(%rdi) -vmovdqa %ymm15,192(%rdi) -vmovdqa %ymm1,224(%rdi) - -ret - -.global nttfrombytes_avx -.global _nttfrombytes_avx -nttfrombytes_avx: -_nttfrombytes_avx: -#consts -vmovdqa _16XMASK*2(%rdx),%ymm0 -call nttfrombytes128_avx -add $256,%rdi -add $192,%rsi -call nttfrombytes128_avx -ret diff --git a/src/avx2/shuffle.inc b/src/avx2/shuffle.inc deleted file mode 100644 index 73e9ffe..0000000 --- a/src/avx2/shuffle.inc +++ /dev/null @@ -1,25 +0,0 @@ -.macro shuffle8 r0,r1,r2,r3 -vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle4 r0,r1,r2,r3 -vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 -vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle2 r0,r1,r2,r3 -#vpsllq $32,%ymm\r1,%ymm\r2 -vmovsldup %ymm\r1,%ymm\r2 -vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 -vpsrlq $32,%ymm\r0,%ymm\r0 -#vmovshdup %ymm\r0,%ymm\r0 -vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm\r2 -vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 -vpsrld $16,%ymm\r0,%ymm\r0 -vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 -.endm diff --git a/src/avx2/verify.rs b/src/avx2/verify.rs deleted file mode 100644 index 2069fef..0000000 --- a/src/avx2/verify.rs +++ /dev/null @@ -1,43 +0,0 @@ -use core::arch::x86_64::*; - -pub fn verify(a: &[u8], b: &[u8], mut len: usize) -> u8 -{ - let (mut f, mut g); - let mut r: u64; - unsafe { - let mut h = _mm256_setzero_si256(); - for i in 0..(len/32) { - f = _mm256_loadu_si256(a[32*i..].as_ptr() as *const __m256i); - g = _mm256_loadu_si256(b[32*i..].as_ptr() as *const __m256i); - f = _mm256_xor_si256(f,g); - h = _mm256_or_si256(h,f); - } - r = 1 - _mm256_testz_si256(h,h) as u64; - } - let idx = 32*(len/32); - len -= idx; - for i in 0..len { - r |= (a[idx+i] ^ b[idx+i]) as u64; - } - (r.wrapping_neg() >> 63) as u8 -} - -pub fn cmov(r: &mut[u8], x: &[u8], mut len: usize, mut b: u8) -{ - let (mut xvec, mut rvec); - unsafe { - let bvec = _mm256_set1_epi64x(b as i64); - for i in 0..(len/32) { - rvec = _mm256_loadu_si256(r[32*i..].as_ptr() as *const __m256i); - xvec = _mm256_loadu_si256(x[32*i..].as_ptr() as 
*const __m256i); - rvec = _mm256_blendv_epi8(rvec,xvec,bvec); - _mm256_storeu_si256(r[32*i..].as_mut_ptr() as *mut __m256i,rvec); - } - } - let idx = 32*(len/32); - len -= idx; - b = b.wrapping_neg(); - for i in 0..len { - r[idx+i] ^= b & (x[idx+i] ^ r[idx+i]); - } -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 77a04d5..b7ef2d4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,6 @@ //! This library: //! * Is no_std compatible and uses no allocations, suitable for embedded devices. //! * The reference files contain no unsafe code. -//! * On x86_64 platforms uses an optimized avx2 version by default. //! * Compiles to WASM using wasm-bindgen. //! //! ## Features @@ -16,20 +15,12 @@ //! |-----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------| //! | kyber512 | Enables kyber512 mode, with a security level roughly equivalent to AES-128. | //! | kyber1024 | Enables kyber1024 mode, with a security level roughly equivalent to AES-256. | -//! | 90s | 90's mode uses SHA2 and AES-CTR as a replacement for SHAKE. This may provide hardware speedups on certain architectures. | -//! | avx2 | On x86_64 platforms enable the optimized version. This flag is will cause a compile error on other architectures. | //! | wasm | For compiling to WASM targets. | -//! | nasm | Uses Netwide Assembler avx2 code instead of GAS for portability. Requires a nasm compiler: https://www.nasm.us/ | //! | zeroize | This will zero out the key exchange structs on drop using the [zeroize](https://docs.rs/zeroize/latest/zeroize/) crate | //! | std | Enable the standard library | //! //! ## Usage //! -//! For optimisations on x86 platforms enable the `avx2` feature and the following RUSTFLAGS: -//! -//! ```shell -//! export RUSTFLAGS="-C target-feature=+aes,+avx2,+sse2,+sse4.1,+bmi2,+popcnt" -//! ``` //! //! ``` //! use safe_pqc_kyber::*; @@ -126,20 +117,9 @@ #[cfg(all(feature = "kyber1024", feature = "kyber512"))] compile_error!("Only one security level can be specified"); -#[cfg(all(target_arch = "x86_64", feature = "avx2"))] -mod avx2; -#[cfg(all(target_arch = "x86_64", feature = "avx2"))] -use avx2::*; - -#[cfg(any(not(target_arch = "x86_64"), not(feature = "avx2")))] mod reference; -#[cfg(any(not(target_arch = "x86_64"), not(feature = "avx2")))] use reference::*; -#[cfg(any(not(target_arch = "x86_64"), not(feature = "avx2")))] -#[cfg(feature = "hazmat")] -pub use reference::indcpa; - #[cfg(feature = "wasm")] mod wasm; diff --git a/src/params.rs b/src/params.rs index aa09547..14f89b8 100644 --- a/src/params.rs +++ b/src/params.rs @@ -14,14 +14,6 @@ pub const KYBER_K: usize = if cfg!(feature = "kyber512") { 3 }; -/// A boolean flag for whether 90's mode is activated. -/// -/// If true AES-CTR and SHA2 will be used as cryptographic primitives instead, -/// which may have hardware speed-ups on certain platforms. -/// -/// Defaults to false, set`features = ["90s"]` in Cargo.toml to enable. -pub const KYBER_90S: bool = cfg!(feature = "90s"); - pub(crate) const KYBER_N: usize = 256; pub(crate) const KYBER_Q: usize = 3329; diff --git a/src/reference/aes256ctr.rs b/src/reference/aes256ctr.rs deleted file mode 100644 index d72bac9..0000000 --- a/src/reference/aes256ctr.rs +++ /dev/null @@ -1,638 +0,0 @@ -// Translated from the public-domain code by Thomas Pornin as -// found in the Kyber C reference library. 
-// https://github.com/pq-crystals/kyber/blob/master/ref/aes256ctr.c - -/* - * Copyright (c) 2016 Thomas Pornin - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#![cfg(feature="90s")] - -pub struct Aes256CtrCtx { - pub sk_exp: [u64; 120], - pub ivw: [u32; 16] -} - -impl Aes256CtrCtx { - pub fn new() -> Self { - Self { - sk_exp: [0u64; 120], - ivw: [0u32; 16] - } - } -} - -fn br_dec32le(src: &[u8]) -> u32 -{ - src[0] as u32 - | (src[1] as u32) << 8 - | (src[2] as u32) << 16 - | (src[3] as u32) << 24 -} - -fn br_range_dec32le(v: &mut [u32], mut num: usize, src: &[u8]) -{ - let mut v_idx: usize = 0; - let mut src_idx: usize = 0; - while num > 0 { - num -= 1; - v[v_idx] = br_dec32le(&src[src_idx..]); - v_idx += 1; - src_idx += 4; - } -} - -fn br_swap32(mut x: u32) -> u32 -{ - x = ((x & 0x00FF00FFu32 ) << 8) | ((x >> 8) & 0x00FF00FFu32); - (x << 16) | (x >> 16) -} - -fn br_enc32le(dst: &mut [u8], x: u32) -{ - dst[0] = x as u8; - dst[1] = (x >> 8) as u8; - dst[2] = (x >> 16) as u8; - dst[3] = (x >> 24) as u8; -} - -fn br_range_enc32le(dst: &mut [u8], v: &[u32], mut num: usize) -{ - let mut v_idx = 0; - let mut dst_idx = 0; - while num > 0 { - br_enc32le(&mut dst[dst_idx..], v[v_idx]); - v_idx += 1; - dst_idx += 4; - num -= 1; - } -} - -fn br_aes_ct64_bitslice_sbox(q: &mut [u64]) -{ - // This S-box implementation is a straightforward translation of - // the circuit described by Boyar and Peralta in "A new - // combinational logic minimization technique with applications - // to cryptology" (https://eprint.iacr.org/2009/191.pdf). - // Note that variables x(input) and s(output) are numbered - // in "reverse" order (x0 is the high bit, x7 is the low bit). 
- let (x0, x1, x2, x3, x4, x5, x6, x7): (u64, u64, u64, u64, u64, u64, u64, u64); - let (y1, y2, y3, y4, y5, y6, y7, y8, y9): (u64, u64, u64, u64, u64, u64, u64, u64, u64); - let (y10, y11, y12, y13, y14, y15, y16, y17, y18, y19): (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64) ; - let (y20, y21): (u64, u64) ; - let (z0, z1, z2, z3, z4, z5, z6, z7, z8, z9): (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64); - let (z10, z11, z12, z13, z14, z15, z16, z17): (u64, u64, u64, u64, u64, u64, u64, u64); - let (t0, t1, t2, t3, t4, t5, t6, t7, t8, t9): (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64); - let (t10, t11, t12, t13, t14, t15, t16, t17, t18, t19): (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64); - let (t20, t21, t22, t23, t24, t25, t26, t27, t28, t29): (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64); - let (t30, t31, t32, t33, t34, t35, t36, t37, t38, t39): (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64); - let (t40, t41, t42, t43, t44, t45, t46, t47, t48, t49): (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64); - let (t50, t51, t52, t53, t54, t55, t56, t57, t58, t59): (u64, u64, u64, u64, u64, u64, u64, u64, u64, u64); - let (t60, t61, t62, t63, t64, t65, t66, t67): (u64, u64, u64, u64, u64, u64, u64, u64); - let (s0, s1, s2, s3, s4, s5, s6, s7): (u64, u64, u64, u64, u64, u64, u64, u64); - - x0 = q[7]; - x1 = q[6]; - x2 = q[5]; - x3 = q[4]; - x4 = q[3]; - x5 = q[2]; - x6 = q[1]; - x7 = q[0]; - - // Top linear transformation. - y14 = x3 ^ x5; - y13 = x0 ^ x6; - y9 = x0 ^ x3; - y8 = x0 ^ x5; - t0 = x1 ^ x2; - y1 = t0 ^ x7; - y4 = y1 ^ x3; - y12 = y13 ^ y14; - y2 = y1 ^ x0; - y5 = y1 ^ x6; - y3 = y5 ^ y8; - t1 = x4 ^ y12; - y15 = t1 ^ x5; - y20 = t1 ^ x1; - y6 = y15 ^ x7; - y10 = y15 ^ t0; - y11 = y20 ^ y9; - y7 = x7 ^ y11; - y17 = y10 ^ y11; - y19 = y10 ^ y8; - y16 = t0 ^ y11; - y21 = y13 ^ y16; - y18 = x0 ^ y16; - - // Non-linear section. - t2 = y12 & y15; - t3 = y3 & y6; - t4 = t3 ^ t2; - t5 = y4 & x7; - t6 = t5 ^ t2; - t7 = y13 & y16; - t8 = y5 & y1; - t9 = t8 ^ t7; - t10 = y2 & y7; - t11 = t10 ^ t7; - t12 = y9 & y11; - t13 = y14 & y17; - t14 = t13 ^ t12; - t15 = y8 & y10; - t16 = t15 ^ t12; - t17 = t4 ^ t14; - t18 = t6 ^ t16; - t19 = t9 ^ t14; - t20 = t11 ^ t16; - t21 = t17 ^ y20; - t22 = t18 ^ y19; - t23 = t19 ^ y21; - t24 = t20 ^ y18; - - t25 = t21 ^ t22; - t26 = t21 & t23; - t27 = t24 ^ t26; - t28 = t25 & t27; - t29 = t28 ^ t22; - t30 = t23 ^ t24; - t31 = t22 ^ t26; - t32 = t31 & t30; - t33 = t32 ^ t24; - t34 = t23 ^ t33; - t35 = t27 ^ t33; - t36 = t24 & t35; - t37 = t36 ^ t34; - t38 = t27 ^ t36; - t39 = t29 & t38; - t40 = t25 ^ t39; - - t41 = t40 ^ t37; - t42 = t29 ^ t33; - t43 = t29 ^ t40; - t44 = t33 ^ t37; - t45 = t42 ^ t41; - z0 = t44 & y15; - z1 = t37 & y6; - z2 = t33 & x7; - z3 = t43 & y16; - z4 = t40 & y1; - z5 = t29 & y7; - z6 = t42 & y11; - z7 = t45 & y17; - z8 = t41 & y10; - z9 = t44 & y12; - z10 = t37 & y3; - z11 = t33 & y4; - z12 = t43 & y13; - z13 = t40 & y5; - z14 = t29 & y2; - z15 = t42 & y9; - z16 = t45 & y14; - z17 = t41 & y8; - - // Bottom linear transformation. 
- t46 = z15 ^ z16; - t47 = z10 ^ z11; - t48 = z5 ^ z13; - t49 = z9 ^ z10; - t50 = z2 ^ z12; - t51 = z2 ^ z5; - t52 = z7 ^ z8; - t53 = z0 ^ z3; - t54 = z6 ^ z7; - t55 = z16 ^ z17; - t56 = z12 ^ t48; - t57 = t50 ^ t53; - t58 = z4 ^ t46; - t59 = z3 ^ t54; - t60 = t46 ^ t57; - t61 = z14 ^ t57; - t62 = t52 ^ t58; - t63 = t49 ^ t58; - t64 = z4 ^ t59; - t65 = t61 ^ t62; - t66 = z1 ^ t63; - s0 = t59 ^ t63; - s6 = t56 ^ !t62; - s7 = t48 ^ !t60; - t67 = t64 ^ t65; - s3 = t53 ^ t66; - s4 = t51 ^ t66; - s5 = t47 ^ t65; - s1 = t64 ^ !s3; - s2 = t55 ^ !t67; - - q[7] = s0; - q[6] = s1; - q[5] = s2; - q[4] = s3; - q[3] = s4; - q[2] = s5; - q[1] = s6; - q[0] = s7; -} - -fn swapn(cl: u64, ch: u64, s: usize, x: u64, y: &mut u64) -> u64 -{ - let a = x; - let b = *y; - *y = ((a & ch) >> (s)) | (b & ch); // update y - (a & cl) | ((b & cl) << s) // return x -} - -fn swap2(x: u64, y: &mut u64) -> u64 -{ - swapn(0x5555555555555555u64, 0xAAAAAAAAAAAAAAAAu64, 1, x, y) -} - -fn swap4(x: u64, y: &mut u64) -> u64 -{ - swapn(0x3333333333333333u64, 0xCCCCCCCCCCCCCCCCu64, 2, x, y) -} - -fn swap8(x: u64, y: &mut u64) -> u64 -{ - swapn(0x0F0F0F0F0F0F0F0Fu64, 0xF0F0F0F0F0F0F0F0u64, 4, x, y) -} - -fn br_aes_ct64_ortho(q: &mut [u64]) -{ - q[0] = swap2(q[0], &mut q[1]); - q[2] = swap2(q[2], &mut q[3]); - q[4] = swap2(q[4], &mut q[5]); - q[6] = swap2(q[6], &mut q[7]); - - q[0] = swap4(q[0], &mut q[2]); - q[1] = swap4(q[1], &mut q[3]); - q[4] = swap4(q[4], &mut q[6]); - q[5] = swap4(q[5], &mut q[7]); - - q[0] = swap8(q[0], &mut q[4]); - q[1] = swap8(q[1], &mut q[5]); - q[2] = swap8(q[2], &mut q[6]); - q[3] =swap8(q[3], &mut q[7]); -} - -fn br_aes_ct64_interleave_in(q0: &mut u64, q1: &mut u64, w: &[u32]) -{ - let (mut x0, mut x1, mut x2, mut x3): (u64, u64, u64, u64); - - x0 = w[0].into(); - x1 = w[1].into(); - x2 = w[2].into(); - x3 = w[3].into(); - x0 |= x0 << 16; - x1 |= x1 << 16; - x2 |= x2 << 16; - x3 |= x3 << 16; - x0 &= 0x0000FFFF0000FFFFu64; - x1 &= 0x0000FFFF0000FFFFu64; - x2 &= 0x0000FFFF0000FFFFu64; - x3 &= 0x0000FFFF0000FFFFu64; - x0 |= x0 << 8; - x1 |= x1 << 8; - x2 |= x2 << 8; - x3 |= x3 << 8; - x0 &= 0x00FF00FF00FF00FFu64; - x1 &= 0x00FF00FF00FF00FFu64; - x2 &= 0x00FF00FF00FF00FFu64; - x3 &= 0x00FF00FF00FF00FFu64; - *q0 = x0 | (x2 << 8); - *q1 = x1 | (x3 << 8); -} - -fn br_aes_ct64_interleave_out(w: &mut[u32], q0: u64, q1: u64) -{ - let (mut x0, mut x1, mut x2, mut x3): (u64, u64, u64, u64); - - x0 = q0 & 0x00FF00FF00FF00FFu64; - x1 = q1 & 0x00FF00FF00FF00FFu64; - x2 = (q0 >> 8) & 0x00FF00FF00FF00FFu64; - x3 = (q1 >> 8) & 0x00FF00FF00FF00FFu64; - x0 |= x0 >> 8; - x1 |= x1 >> 8; - x2 |= x2 >> 8; - x3 |= x3 >> 8; - x0 &= 0x0000FFFF0000FFFFu64; - x1 &= 0x0000FFFF0000FFFFu64; - x2 &= 0x0000FFFF0000FFFFu64; - x3 &= 0x0000FFFF0000FFFFu64; - w[0] = x0 as u32 | (x0 >> 16) as u32 ; - w[1] = x1 as u32 | (x1 >> 16) as u32 ; - w[2] = x2 as u32 | (x2 >> 16) as u32 ; - w[3] = x3 as u32 | (x3 >> 16) as u32 ; -} - -fn sub_word(x: u32) -> u32 { - let mut q = [0u64; 8]; - q[0] = x.into(); - br_aes_ct64_ortho(&mut q); - br_aes_ct64_bitslice_sbox(&mut q); - br_aes_ct64_ortho(&mut q); - q[0] as u32 -} - -const RCON: [u32; 10] = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36]; - -fn br_aes_ct64_keysched(comp_skey: &mut[u64], key: &[u8]) -{ - let (mut j, mut k) = (0usize, 0usize); - let mut skey = [0u32; 60]; - - let key_len = 32usize; - - let nk = key_len >> 2; - let nkf = (14 + 1) << 2; - br_range_dec32le(&mut skey, (key_len >> 2) as usize, key); - let mut tmp = skey[(key_len >> 2) - 1]; - for i in nk..nkf { - if j == 0 { - 
tmp = (tmp << 24) | (tmp >> 8); - tmp = sub_word(tmp) ^ RCON[k]; - } else if nk > 6 && j == 4 { - tmp = sub_word(tmp); - } - tmp ^= skey[i - nk]; - skey[i] = tmp; - j += 1; - if j == nk { - j = 0; - k += 1; - } - } - - j = 0; - for idx in (0..nkf).step_by(4) { - let mut q = [0u64; 8]; - - - let (q0, q1) = q.split_at_mut(4); - br_aes_ct64_interleave_in(&mut q0[0], &mut q1[0], &skey[idx..] ); - q[1] = q[0]; - q[2] = q[0]; - q[3] = q[0]; - q[5] = q[4]; - q[6] = q[4]; - q[7] = q[4]; - br_aes_ct64_ortho(&mut q); - comp_skey[j] = - (q[0] & 0x1111111111111111) - | (q[1] & 0x2222222222222222) - | (q[2] & 0x4444444444444444) - | (q[3] & 0x8888888888888888); - comp_skey[j + 1] = - (q[4] & 0x1111111111111111) - | (q[5] & 0x2222222222222222) - | (q[6] & 0x4444444444444444) - | (q[7] & 0x8888888888888888); - j += 2; - } -} - -fn br_aes_ct64_skey_expand(skey: &mut[u64], comp_skey: &[u64]) -{ - const N: usize = 15 << 1; - let mut u = 0; - let mut v = 0; - let mut x0: u64; - let mut x1: u64; - let mut x2: u64; - let mut x3: u64; - while u < N { - x0 = comp_skey[u]; - x1 = comp_skey[u]; - x2 = comp_skey[u]; - x3 = comp_skey[u]; - x0 &= 0x1111111111111111; - x1 &= 0x2222222222222222; - x2 &= 0x4444444444444444; - x3 &= 0x8888888888888888; - x1 >>= 1; - x2 >>= 2; - x3 >>= 3; - skey[v ] = (x0 << 4).wrapping_sub(x0); - skey[v + 1] = (x1 << 4).wrapping_sub(x1); - skey[v + 2] = (x2 << 4).wrapping_sub(x2); - skey[v + 3] = (x3 << 4).wrapping_sub(x3); - v += 4; - u += 1; - } -} - -fn add_round_key(q: &mut[u64], sk: &[u64]) -{ - q[0] ^= sk[0]; - q[1] ^= sk[1]; - q[2] ^= sk[2]; - q[3] ^= sk[3]; - q[4] ^= sk[4]; - q[5] ^= sk[5]; - q[6] ^= sk[6]; - q[7] ^= sk[7]; -} - -fn shift_rows(q: &mut [u64]) -{ - for x in q.iter_mut() { - *x = (*x & 0x000000000000FFFF) - | ((*x & 0x00000000FFF00000) >> 4) - | ((*x & 0x00000000000F0000) << 12) - | ((*x & 0x0000FF0000000000) >> 8) - | ((*x & 0x000000FF00000000) << 8) - | ((*x & 0xF000000000000000) >> 12) - | ((*x & 0x0FFF000000000000) << 4) - }; -} - -fn rotr32(x: u64) -> u64 -{ - (x << 32) | (x >> 32) -} - -fn mix_columns(q: &mut [u64]) -{ - let q0 = q[0]; - let q1 = q[1]; - let q2 = q[2]; - let q3 = q[3]; - let q4 = q[4]; - let q5 = q[5]; - let q6 = q[6]; - let q7 = q[7]; - let r0 = (q0 >> 16) | (q0 << 48); - let r1 = (q1 >> 16) | (q1 << 48); - let r2 = (q2 >> 16) | (q2 << 48); - let r3 = (q3 >> 16) | (q3 << 48); - let r4 = (q4 >> 16) | (q4 << 48); - let r5 = (q5 >> 16) | (q5 << 48); - let r6 = (q6 >> 16) | (q6 << 48); - let r7 = (q7 >> 16) | (q7 << 48); - - q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); - q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); - q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); - q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); - q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); - q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); - q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); - q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); -} - -fn inc4_be(x: u32) -> u32 -{ - let t = br_swap32(x) + 4; - br_swap32(t) -} - -fn aes_ctr4x(out: &mut [u8], ivw: &mut [u32], sk_exp: &[u64]) -{ - let mut w = [0u32; 16]; - w.copy_from_slice(&ivw); - let mut q = [0u64; 8]; - let (q0, q1) = q.split_at_mut(4); - for i in 0..4 { - br_aes_ct64_interleave_in(&mut q0[i], &mut q1[i], &w[(i << 2)..]); - } - br_aes_ct64_ortho(&mut q); - - add_round_key(&mut q, sk_exp); - for i in 1..14 { - br_aes_ct64_bitslice_sbox(&mut q); - shift_rows(&mut q); - mix_columns(&mut q); - add_round_key(&mut q, &sk_exp[(i << 3)..]); - } - br_aes_ct64_bitslice_sbox(&mut q); - shift_rows(&mut q); - add_round_key(&mut q, &sk_exp[112..]); - - 
br_aes_ct64_ortho(&mut q); - for i in 0..4 { - br_aes_ct64_interleave_out(&mut w[(i << 2)..], q[i], q[i + 4]); - } - br_range_enc32le(out, &w, 16); - - /* Increase counter for next 4 blocks */ - ivw[3] = inc4_be(ivw[3]); - ivw[7] = inc4_be(ivw[7]); - ivw[11] = inc4_be(ivw[11]); - ivw[15] = inc4_be(ivw[15]); -} - -fn br_aes_ct64_ctr_init(sk_exp: &mut [u64], key: &[u8]) -{ - let mut skey = [0u64; 30]; - br_aes_ct64_keysched(&mut skey, key); - br_aes_ct64_skey_expand(sk_exp, &skey); -} - -#[cfg(not(feature="90s-fixslice"))] -fn br_aes_ct64_ctr_run(sk_exp: &mut[u64], iv: &[u8], cc: u32, data: &mut[u8], mut len: usize) -{ - let mut ivw = [0u32; 16]; - br_range_dec32le(&mut ivw, 3, iv); - let mut slice = [0u32; 3]; - slice.copy_from_slice(&ivw[0..3]); - ivw[4..7].copy_from_slice(&slice); - ivw[8..11].copy_from_slice(&slice); - ivw[12..15].copy_from_slice(&slice); - ivw[ 3] = br_swap32(cc); - ivw[ 7] = br_swap32(cc + 1); - ivw[11] = br_swap32(cc + 2); - ivw[15] = br_swap32(cc + 3); - - let mut idx = 0; - while len > 64 { - aes_ctr4x(&mut data[idx..], &mut ivw, sk_exp); - idx += 64; - len -= 64; - } - if len > 0 { - let mut tmp = [0u8; 64]; - aes_ctr4x(&mut tmp, &mut ivw, sk_exp); - data[idx..].copy_from_slice(&tmp[..len]) - } -} - -// Name: aes256_prf -// -// Description: AES256 stream generation in CTR mode using 32-bit counter, -// nonce is zero-padded to 12 bytes, counter starts at zero -// -// Arguments: - [u8] output: output -// - usize outlen: length of requested output in bytes -// - const [u8] key: 32-byte key -// - const u8 nonce: 1-byte nonce (will be zero-padded to 12 bytes) -#[cfg(not(feature="90s-fixslice"))] -pub fn aes256ctr_prf(output: &mut[u8], outlen: usize, key: &[u8], nonce: u8) -{ - let mut sk_exp = [0u64; 120]; - let mut pad_nonce = [0u8; 12]; - pad_nonce[0] = nonce; - br_aes_ct64_ctr_init(&mut sk_exp, key); - br_aes_ct64_ctr_run(&mut sk_exp, &pad_nonce, 0, output, outlen); -} - -// Name: aes256ctr_init -// -// Description: AES256 CTR used as a replacement for a XOF; this function -// "absorbs" a 32-byte key and two additional bytes that are zero-padded -// to a 12-byte nonce -// -// Arguments: - aes256xof_ctx *s: state to "absorb" key and IV into -// - const [u8] key: 32-byte key -// - [u8] nonce: additional bytes to "absorb" -pub fn aes256ctr_init( - s: &mut Aes256CtrCtx, - key: &[u8], - nonce: [u8; 12] -) -{ - br_aes_ct64_ctr_init(&mut s.sk_exp, key); - - br_range_dec32le(&mut s.ivw, 3, &nonce); - let mut slice = [0u32; 3]; - slice.copy_from_slice(&s.ivw[..3]); - s.ivw[4..7].copy_from_slice(&slice); - s.ivw[8..11].copy_from_slice(&slice); - s.ivw[12..15].copy_from_slice(&slice); - s.ivw[ 3] = br_swap32(0); - s.ivw[ 7] = br_swap32(1); - s.ivw[11] = br_swap32(2); - s.ivw[15] = br_swap32(3); -} - -pub fn aes256ctr_squeezeblocks( - out: &mut[u8], - mut nblocks: usize, - s: &mut Aes256CtrCtx -) -{ - let mut idx = 0; - while nblocks > 0 { - aes_ctr4x(&mut out[idx..], &mut s.ivw, &s.sk_exp); - idx += 64; - nblocks -= 1; - } -} \ No newline at end of file diff --git a/src/reference/mod.rs b/src/reference/mod.rs index 16efd7d..7c05a15 100644 --- a/src/reference/mod.rs +++ b/src/reference/mod.rs @@ -1,4 +1,3 @@ -pub mod aes256ctr; pub mod cbd; pub mod fips202; pub mod indcpa; @@ -6,4 +5,4 @@ pub mod ntt; pub mod poly; pub mod polyvec; pub mod reduce; -pub mod verify; \ No newline at end of file +pub mod verify; diff --git a/src/symmetric.rs b/src/symmetric.rs index 8f44588..6bebfee 100644 --- a/src/symmetric.rs +++ b/src/symmetric.rs @@ -1,31 +1,11 @@ #![allow(dead_code)] 
-#[cfg(feature = "90s")] -use crate::aes256ctr::*; -#[cfg(not(feature = "90s"))] use crate::{fips202::*, params::*}; -#[cfg(feature = "90s")] -use sha2::{Digest, Sha256, Sha512}; -#[cfg(feature = "90s-fixslice")] -use aes::cipher::{generic_array::GenericArray, KeyIvInit, StreamCipher}; -#[cfg(feature = "90s-fixslice")] -type Aes256Ctr = ctr::Ctr32BE; - -#[cfg(feature = "90s")] -pub(crate) const AES256CTR_BLOCKBYTES: usize = 64; - -#[cfg(feature = "90s")] -pub(crate) const XOF_BLOCKBYTES: usize = AES256CTR_BLOCKBYTES; -#[cfg(not(feature = "90s"))] pub(crate) const XOF_BLOCKBYTES: usize = SHAKE128_RATE; -#[cfg(not(feature = "90s"))] pub(crate) type XofState = KeccakState; -#[cfg(feature = "90s")] -pub(crate) type XofState = Aes256CtrCtx; - #[derive(Copy, Clone)] pub(crate) struct KeccakState { pub s: [u64; 25], @@ -47,102 +27,37 @@ impl KeccakState { } // SHA3-256 -#[cfg(not(feature = "90s"))] pub(crate) fn hash_h(out: &mut[u8], input: &[u8], inlen: usize) { sha3_256(out, input, inlen); } -// 90s mode SHA2-256 -#[cfg(feature = "90s")] -pub(crate) fn hash_h(out: &mut[u8], input: &[u8], inlen: usize) -{ - let mut hasher = Sha256::new(); - hasher.update(&input[..inlen]); - let digest = hasher.finalize(); - out[..digest.len()].copy_from_slice(&digest); -} -#[cfg(not(feature = "90s"))] pub(crate) fn hash_g(out: &mut[u8], input: &[u8], inlen: usize) { sha3_512(out, input, inlen); } -#[cfg(feature = "90s")] -pub(crate) fn hash_g(out: &mut[u8], input: &[u8], inlen: usize) -{ - let mut hasher = Sha512::new(); - hasher.update(&input[..inlen]); - let digest = hasher.finalize(); - out[..digest.len()].copy_from_slice(&digest); -} - -#[cfg(not(feature = "90s"))] pub(crate) fn xof_absorb(state: &mut XofState, input: &[u8], x: u8, y: u8) { kyber_shake128_absorb(state, &input, x, y); } -#[cfg(feature = "90s")] -pub(crate) fn xof_absorb(state: &mut XofState, input: &[u8], x: u8, y: u8) -{ - let mut nonce = [0u8; 12]; - nonce[0] = x; - nonce[1] = y; - aes256ctr_init(state, &input, nonce); -} - -#[cfg(not(feature = "90s"))] pub(crate) fn xof_squeezeblocks(out: &mut[u8], outblocks: usize, state: &mut XofState) { kyber_shake128_squeezeblocks(out, outblocks, state); } -#[cfg(feature = "90s")] -pub(crate) fn xof_squeezeblocks(out: &mut[u8], outblocks: usize, state: &mut XofState) -{ - aes256ctr_squeezeblocks(out, outblocks, state); -} - -#[cfg(not(feature = "90s"))] pub(crate) fn prf(out: &mut[u8], outbytes: usize, key: &[u8], nonce: u8) { shake256_prf(out, outbytes, &key, nonce); } -#[cfg(feature = "90s")] -pub fn prf(out: &mut [u8], _outbytes: usize, key: &[u8], nonce: u8) { - #[cfg(feature = "90s-fixslice")] - { - // RustCrypto fixslice - let mut expnonce = [0u8; 16]; - expnonce[0] = nonce; - let key = GenericArray::from_slice(key); - let iv = GenericArray::from_slice(&expnonce); - let mut cipher = Aes256Ctr::new(&key, &iv); - cipher.apply_keystream(out); - return - } - #[cfg(not(feature = "90s-fixslice"))] - // Pornin bitslice - aes256ctr_prf(out, _outbytes, &key, nonce); -} - -#[cfg(not(feature = "90s"))] pub(crate) fn kdf(out: &mut[u8], input: &[u8], inlen: usize) { shake256(out, KYBER_SSBYTES, input, inlen); } -#[cfg(feature = "90s")] -pub(crate) fn kdf(out: &mut[u8], input: &[u8], inlen: usize) -{ - let mut hasher = Sha256::new(); - hasher.update(&input[..inlen]); - let digest = hasher.finalize(); - out[..digest.len()].copy_from_slice(&digest); -} // Name: kyber_shake128_absorb // @@ -152,7 +67,6 @@ pub(crate) fn kdf(out: &mut[u8], input: &[u8], inlen: usize) // - const [u8] input: KYBER_SYMBYTES input to 
be absorbed into s // - u8 x additional byte of input // - u8 y additional byte of input -#[cfg(not(feature = "90s"))] fn kyber_shake128_absorb( s: &mut KeccakState, input: &[u8], @@ -176,7 +90,6 @@ fn kyber_shake128_absorb( // Arguments: - [u8] output: output blocks // - u64 nblocks: number of blocks to be squeezed (written to output) // - keccak_state *s: in/output Keccak state -#[cfg(not(feature = "90s"))] fn kyber_shake128_squeezeblocks( output: &mut[u8], nblocks: usize, @@ -195,7 +108,6 @@ fn kyber_shake128_squeezeblocks( // - u64 outlen: number of requested output bytes // - const [u8] key: the key (of length KYBER_SYMBYTES) // - const [u8] nonce: single-byte nonce (public PRF input) -#[cfg(not(feature = "90s"))] fn shake256_prf(output: &mut[u8], outlen: usize, key: &[u8], nonce: u8) { let mut extkey = [0u8; KYBER_SYMBYTES+1]; diff --git a/tests/KAT/SHA256SUMS_ORIG b/tests/KAT/SHA256SUMS_ORIG index 807022d..b7a53c6 100644 --- a/tests/KAT/SHA256SUMS_ORIG +++ b/tests/KAT/SHA256SUMS_ORIG @@ -1,6 +1,3 @@ 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85 tvecs512 -2ea81fa2d7e3c1970409b9d77d6c5137aeb4573e856ca79eab4393b70352e85b tvecs512-90s 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6 tvecs768 -a1b8fe37e3fc58a8511c63a7187d3626a1a98c5d3bb67000fe9a02be7199d952 tvecs768-90s ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10 tvecs1024 -f547f5361f933e6489d2385524ffd36893063c6b9cc3f921514b4ebb9daefdaa tvecs1024-90s diff --git a/tests/KAT/readme.md b/tests/KAT/readme.md index 43b03ab..b1bffaa 100644 --- a/tests/KAT/readme.md +++ b/tests/KAT/readme.md @@ -5,11 +5,8 @@ The test vectors need to be generated locally. Running [build_kats.sh](./build_k This results in 6 files, each containing 10000 KATs, total size is ~600MB: * tvecs512 -* tvecs512-90s * tvecs768 -* tvecs768-90s * tvecs1024 -* tvecs1024-90s These need to be then moved into the `tests/KAT` folder. The `SHA256SUMS_ORIG` file contains the digests this library was tested against. @@ -45,4 +42,4 @@ done; # Confirm SHA256SUMS match rust repo KAT's # Please submit a github issue if upstream test vectors have changed diff SHA256SUMS_ORIG SHA256SUMS -``` \ No newline at end of file +``` diff --git a/tests/load/mod.rs b/tests/load/mod.rs index 64ef137..2c9d10e 100644 --- a/tests/load/mod.rs +++ b/tests/load/mod.rs @@ -46,7 +46,7 @@ impl From<&[String]> for Kat { } } -// Get KAT filename based on security level and if 90s mode +// Get KAT filename based on security level fn get_filename() -> String { let mut filename = match KYBER_K { 2 => "tvecs512".to_string(), @@ -54,9 +54,6 @@ fn get_filename() -> String { 4 => "tvecs1024".to_string(), _ => panic!("No security level set") }; - if KYBER_90S { - filename.push_str("-90s"); - } println!("Using KAT file: {}", &filename); filename } @@ -97,4 +94,4 @@ pub fn build_kats() -> Vec { |c| {c.into()} ) .collect::>() -} \ No newline at end of file +} diff --git a/tests/readme.md b/tests/readme.md index 375290a..c949868 100644 --- a/tests/readme.md +++ b/tests/readme.md @@ -9,12 +9,6 @@ cd KAT Which will clone the C reference repo, generate the KAT files, then rename and put them in the correct folder for testing. -To run the known answer tests you will need to enable `kyber_kat` in `RUSTFLAGS`. To check different Kyber levels or 90's mode you will need to include those flags also. 
eg: -```bash -RUSTFLAGS=' --cfg kyber_kat' cargo test --features "kyber1024 90s" -``` - -For applicible x86 architectures you must export the avx2 RUSTFLAGS if you don't want to test on the reference codebase. To run a matrix of all possible features use the helper script from this folder: ```shell @@ -25,13 +19,11 @@ The script also checks for the existence of different environment variables and its behaviour * KAT: Runs the known answer tests -* AVX2: Runs avx2 code on x86 platforms with compiled GAS files -* NASM: Runs avx2 code with both GAS and NASM files seperately, requires a NASM compiler installed To activate, instantiate the variables, for example: ```shell -KAT=1 AVX2=1 NASM=1 ./run_all_tests.sh +KAT=1 ./run_all_tests.sh ``` Test files: diff --git a/tests/run_all_tests.sh b/tests/run_all_tests.sh index 3db2efa..612622e 100755 --- a/tests/run_all_tests.sh +++ b/tests/run_all_tests.sh @@ -5,12 +5,7 @@ set -e # # Variables: # KAT - Runs the known answer tests -# AVX2 - Runs avx2 code on x86 platforms with compiled GAS files -# NASM - Runs avx2 code with both GAS and NASM files seperately -# When setting AVX2 or NASM flags enable avx2 target features -# and LLVM address sanitser checks (requires nightly): -# export RUSTFLAGS="${RUSTFLAGS:-} -Z sanitizer=address -C target-cpu=native -C target-feature=+aes,+avx2,+sse2,+sse4.1,+bmi2,+popcnt" TARGET=$(rustc -vV | sed -n 's|host: ||p') @@ -25,21 +20,6 @@ if [ -z "$KAT" ] RUSTFLAGS+=" --cfg kyber_kat" fi -if [ -z "$AVX2" ] - then - echo Not using AVX2 optimisations - OPT=("") - else - echo Using AVX2 optimisations with GAS assembler - OPT=("" "avx2") -fi - -if [[ ! -z "$NASM" ]] - then - echo Using AVX2 optimisations with NASM assembler - OPT+=("nasm") -fi - # Print Headers announce(){ title="# $1 #" @@ -54,7 +34,7 @@ start=`date +%s` announce $TARGET LEVELS=("kyber512" "kyber768" "kyber1024") -NINES=("" "90s" "90s-fixslice") +NINES=("") for level in "${LEVELS[@]}"; do for nine in "${NINES[@]}"; do